{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# initialize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Widen the notebook container and use a readable monospace editor font.\n",
    "# (Dropped `from __future__ import nested_scopes` -- it has been a no-op\n",
    "# since Python 2.2 and does nothing on any modern interpreter.)\n",
    "from IPython.display import display, HTML\n",
    "display(HTML('<style>.container { width:100% !important; }</style>'))\n",
    "display(HTML('<style>.CodeMirror{font-family: \"Courier New\";font-size: 12pt;}</style>'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true,
    "lang": "en"
   },
   "outputs": [],
   "source": [
    "# Silence noisy framework output: keep only ERROR-level log records and\n",
    "# suppress all Python warnings for the rest of the notebook.\n",
    "import logging\n",
    "import warnings\n",
    "\n",
    "logger = logging.getLogger()  # root logger, kept as a notebook-wide name\n",
    "logger.setLevel(logging.ERROR)\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import datetime\n",
    "from datetime import date\n",
    "import time\n",
    "import threading\n",
    "import gzip\n",
    "import json\n",
    "import math\n",
    "import re\n",
    "import html\n",
    "import builtins\n",
    "\n",
    "import collections\n",
    "import numpy\n",
    "import pandas\n",
    "pandas.options.display.max_rows=50\n",
    "pandas.options.display.max_columns=200\n",
    "pandas.options.display.float_format = '{:,}'.format\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick\n",
    "import matplotlib.lines as mlines\n",
    "from matplotlib import colors\n",
    "from matplotlib import rcParams\n",
    "rcParams['font.sans-serif'] =  'Courier New'\n",
    "rcParams['font.family'] = 'Courier New'\n",
    "rcParams['font.size'] = '12'\n",
    "%matplotlib inline\n",
    "\n",
    "from ipywidgets import IntProgress,Layout\n",
    "\n",
    "import pyspark\n",
    "import pyspark.sql\n",
    "import pyspark.sql.functions as F\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import to_date, floor, lit, rank, col, lag, when, pandas_udf, PandasUDFType, avg, sum as _sum\n",
    "from pyspark.sql.window import Window\n",
    "from pyspark.sql.types import *\n",
    "from pyspark.ml import Pipeline\n",
    "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
    "from pyspark.ml.clustering import KMeans\n",
    "from pyspark.storagelevel import StorageLevel\n",
    "\n",
    "import seaborn as sns\n",
    "from functools import reduce\n",
    "from pandasql import sqldf\n",
    "from itertools import chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Connect to the HDFS NameNode WebHDFS endpoint on this host (default port 9870).\n",
    "# `fs` is used throughout the notebook to read emon captures and .stat files.\n",
    "import pyhdfs\n",
    "import socket\n",
    "localhost=socket.gethostname()\n",
    "local_ip=socket.gethostbyname(localhost)  # also used later to build trace-viewer links\n",
    "\n",
    "fs = pyhdfs.HdfsClient(hosts=f'{local_ip}:9870', user_name='sparkuser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# HDFS filesystem helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def getexecutor_stat(pdir):\n",
    "    '''Sum per-executor /proc-style I/O counters from *.stat files under `pdir`.\n",
    "\n",
    "    Walks every executor directory (skipping summary.parquet), reads each\n",
    "    .stat file via the notebook-global `fs` HDFS client, and returns a 5-tuple\n",
    "    of MB totals: (rchar, wchar, read_bytes, write_bytes, cancelled_write_bytes).\n",
    "    NOTE: assumes `pdir` ends with '/' -- confirm at call sites.\n",
    "    '''\n",
    "    fields = ('rchar', 'wchar', 'read_bytes', 'write_bytes', 'cancelled_write_bytes')\n",
    "    totals = {name: 0 for name in fields}\n",
    "    for entry in fs.list_status(pdir):\n",
    "        if entry['type'] != 'DIRECTORY' or entry['pathSuffix'] == 'summary.parquet':\n",
    "            continue\n",
    "        cdir = pdir + entry['pathSuffix']\n",
    "        for statname in fs.listdir(cdir):\n",
    "            if not statname.endswith('.stat'):\n",
    "                continue\n",
    "            with fs.open(cdir + '/' + statname) as statfile:\n",
    "                # Each counter typically appears twice per file (start/end\n",
    "                # snapshot); `value - previous` therefore yields end - start.\n",
    "                deltas = {name: 0 for name in fields}\n",
    "                for raw in statfile.readlines():\n",
    "                    line = raw.decode('ascii')\n",
    "                    for name in fields:\n",
    "                        if line.startswith(name):\n",
    "                            deltas[name] = int(line.split(' ')[-1]) - deltas[name]\n",
    "                            break\n",
    "                for name in fields:\n",
    "                    totals[name] += deltas[name] / 1024 / 1024\n",
    "    return (totals['rchar'], totals['wchar'], totals['read_bytes'],\n",
    "            totals['write_bytes'], totals['cancelled_write_bytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):\n",
    "    '''Pandas Styler helper: map Series values onto CSS background colors.\n",
    "\n",
    "    Values are normalized into [m - rng*low, M + rng*high] (rng = M - m) and\n",
    "    looked up in colormap `cmap`; returns one 'background-color: #rrggbb'\n",
    "    string per element of `s`.\n",
    "\n",
    "    NOTE(review): an identical copy of this function is defined again in the\n",
    "    EMON section below; the later definition shadows this one at runtime.\n",
    "    '''\n",
    "    from matplotlib import colors\n",
    "    rng = M - m\n",
    "    norm = colors.Normalize(m - (rng * low),\n",
    "                            M + (rng * high))\n",
    "    normed = norm(s.values)\n",
    "    # plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;\n",
    "    # prefer the colormap registry when this matplotlib provides it.\n",
    "    if hasattr(matplotlib, 'colormaps'):\n",
    "        cmap_obj = matplotlib.colormaps[cmap]\n",
    "    else:\n",
    "        cmap_obj = plt.cm.get_cmap(cmap)\n",
    "    hex_colors = [colors.rgb2hex(x) for x in cmap_obj(normed)]\n",
    "    return ['background-color: {:s}'.format(color) for color in hex_colors]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": "true",
    "heading_collapsed": true
   },
   "source": [
    "# base class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class SparkLog_Analysis:\n",
    "    '''Placeholder for Spark event-log analysis.\n",
    "\n",
    "    NOTE(review): stub -- __init__ ignores all of its arguments and stores\n",
    "    nothing; flesh out or remove.\n",
    "    '''\n",
    "    def __init__(self, appid,jobids,clients):\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Analysis:\n",
    "    '''Base class for chrome://tracing trace-view generation.\n",
    "\n",
    "    Subclasses override load_data() to populate self.df and extend\n",
    "    generate_trace_view_list() with their own counter events.\n",
    "    '''\n",
    "    def __init__(self,file):\n",
    "        self.file=file        # input file/path the subclass will load\n",
    "        self.starttime=0      # epoch-ms origin subtracted from event timestamps\n",
    "        self.df=None          # lazily-loaded DataFrame (see load_data)\n",
    "    \n",
    "    def load_data(self):\n",
    "        # Overridden by subclasses; the base class has nothing to load.\n",
    "        pass\n",
    "    \n",
    "    def generate_trace_view_list(self,id=0, **kwargs):\n",
    "        '''Return the process-name metadata event for process `id`.\n",
    "\n",
    "        kwargs: node -- display name for the process row (default \"node\").\n",
    "        '''\n",
    "        if self.df is None:  # fixed: was `==None`; identity test is correct here\n",
    "            self.load_data()\n",
    "        trace_events=[]\n",
    "        node=kwargs.get('node',\"node\")\n",
    "        trace_events.append(json.dumps({\"name\": \"process_name\",\"ph\": \"M\",\"pid\":id,\"tid\":0,\"args\":{\"name\":\" \"+node}}))\n",
    "        return trace_events\n",
    "   \n",
    "    def generate_trace_view(self, trace_output, **kwargs):\n",
    "        '''Write all trace events as a Chrome trace JSON file and display a viewer link.'''\n",
    "        traces=[]\n",
    "        traces.extend(self.generate_trace_view_list(0,**kwargs))\n",
    "        \n",
    "        output='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        ''' + \\\n",
    "        \",\\n\".join(traces)\\\n",
    "       + '''\n",
    "            ],\n",
    "            \"displayTimeUnit\": \"ns\"\n",
    "        }'''\n",
    "\n",
    "        # Paths containing \"home\" are treated as full output paths; bare names\n",
    "        # go into the default trace_result folder.\n",
    "        if(\"home\" in trace_output):\n",
    "            outputfolder=trace_output\n",
    "            appidx=trace_output.split(\"/\")[-1]\n",
    "        else:\n",
    "            outputfolder='/home/sparkuser/trace_result/'+trace_output+'.json'\n",
    "            appidx=trace_output\n",
    "        with open(outputfolder, 'w') as outfile: \n",
    "            outfile.write(output)\n",
    "        \n",
    "        traceview_link=f'http://{local_ip}:1088/tracing_examples/trace_viewer.html#/tracing/test_data/{appidx}.json'\n",
    "        display(HTML(f\"<a href={traceview_link}>{traceview_link}</a>\"))\n",
    "        return traceview_link"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# EMON process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_alias_name(metric,func):\n",
    "    '''Column alias for an aggregated metric: \"<metric>_<agg func __name__>\".'''\n",
    "    return \"_\".join([metric, func.__name__])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def splits_fill0(x):\n",
    "    '''Partition mapper for raw emon text: tokenize each line on whitespace,\n",
    "    strip commas from the tokens, and right-pad every row with '0' so all rows\n",
    "    have exactly 192*4+5 columns (required for a uniform toDF()).'''\n",
    "    width = 192*4 + 5\n",
    "    rows = []\n",
    "    for line in x:\n",
    "        tokens = [tok.replace(\",\", \"\") for tok in re.split(r'\\s+', line.strip())]\n",
    "        while len(tokens) < width:\n",
    "            tokens.append('0')\n",
    "        rows.append(tokens)\n",
    "    return iter(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):\n",
    "    '''Pandas Styler helper: map Series values onto CSS background colors.\n",
    "\n",
    "    NOTE(review): duplicate of the definition in the \"fs functions\" section\n",
    "    above -- this copy shadows the earlier one; consider keeping only one.\n",
    "    '''\n",
    "    from matplotlib import colors\n",
    "    rng = M - m\n",
    "    norm = colors.Normalize(m - (rng * low),\n",
    "                            M + (rng * high))\n",
    "    normed = norm(s.values)\n",
    "    # plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;\n",
    "    # prefer the colormap registry when this matplotlib provides it.\n",
    "    if hasattr(matplotlib, 'colormaps'):\n",
    "        cmap_obj = matplotlib.colormaps[cmap]\n",
    "    else:\n",
    "        cmap_obj = plt.cm.get_cmap(cmap)\n",
    "    hex_colors = [colors.rgb2hex(x) for x in cmap_obj(normed)]\n",
    "    return ['background-color: {:s}'.format(color) for color in hex_colors]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Emon_Analysis(Analysis):\n",
    "    '''Parse and analyze an Intel EMON capture stored on HDFS.\n",
    "\n",
    "    Reads CPU/uncore topology from emonv.txt next to the capture, converts the\n",
    "    raw emon text into a parquet-backed Spark DataFrame, and derives metrics\n",
    "    (cpu%, frequency, pathlength, ipc) for trace views and summary tables.\n",
    "    Relies on notebook globals: `fs` (pyhdfs), `spark`, `sc`, and F\n",
    "    (pyspark.sql.functions).\n",
    "    '''\n",
    "    def __init__(self,emon_file):\n",
    "        Analysis.__init__(self,emon_file)\n",
    "        \n",
    "        # Topology/counter configuration comes from emonv.txt in the same folder.\n",
    "        paths=os.path.split(self.file)\n",
    "        if fs.exists(paths[0]+\"/emonv.txt\"):\n",
    "            self.totalcores=0\n",
    "            self.numberofpackages=0\n",
    "            self.coresperpackage=0\n",
    "            self.threadsperpackage=0\n",
    "            self.tsc=0\n",
    "            self.unc_cha_cnt=0\n",
    "            self.unc_mdf_cnt=0\n",
    "            self.unc_imc_cnt=0\n",
    "            self.unc_cxlcm_cnt=0\n",
    "            self.unc_cxldp_cnt=0\n",
    "            self.unc_mchbm_cnt=0\n",
    "            self.unc_m2hbm_cnt=0\n",
    "            self.unc_pmem_fc_cnt=0\n",
    "            self.unc_pmem_mc_cnt=0\n",
    "            self.unc_m2m_cnt=0\n",
    "            self.unc_qpi_cnt=0\n",
    "            self.unc_r3qpi_cnt=0\n",
    "            self.unc_iio_cnt=0\n",
    "            self.unc_irp_cnt=0\n",
    "            self.unc_pcu_cnt=0\n",
    "            self.unc_ubox_cnt=0\n",
    "            self.unc_m2pcie_cnt=0\n",
    "            self.unc_rdt_cnt=0\n",
    "            with fs.open(paths[0]+\"/emonv.txt\") as f:\n",
    "                allcnt = f.read().decode('ascii')\n",
    "            # Per-unit uncore counts below are doubled -- presumably one count\n",
    "            # per socket on a 2-socket system; TODO confirm.\n",
    "            for l in allcnt.split(\"\\n\"):\n",
    "                if l.startswith(\"number_of_online_processors\"):\n",
    "                    self.totalcores=int(re.split(\" +\",l)[2])\n",
    "                elif re.search(\"Number of Packages: +(\\d+)\",l):\n",
    "                    self.numberofpackages=int(re.search(\"Number of Packages: +(\\d+)\",l).group(1))\n",
    "                elif re.search(\"Cores Per Package: +(\\d+)\",l):\n",
    "                    self.coresperpackage=int(re.search(\"Cores Per Package: +(\\d+)\",l).group(1))\n",
    "                elif re.search(\"Threads Per Package: +(\\d+)\",l):\n",
    "                    self.threadsperpackage=int(re.search(\"Threads Per Package: +(\\d+)\",l).group(1))\n",
    "                elif re.search(\"TSC Freq +[.]+ +([0-9.]+)\",l):\n",
    "                    self.tsc=int(float(re.search(\"TSC Freq +[.]+ +([0-9.]+)\",l).group(1))*1000000)\n",
    "                elif l.startswith(\"    cha\"):\n",
    "                    self.unc_cha_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    mdf\"):\n",
    "                    self.unc_mdf_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    imc\"):\n",
    "                    self.unc_imc_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    cxlcm\"):\n",
    "                    self.unc_cxlcm_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    cxldp\"):\n",
    "                    self.unc_cxldp_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    mchbm\"):\n",
    "                    self.unc_mchbm_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    m2hbm\"):\n",
    "                    self.unc_m2hbm_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    pmem_fc\"):\n",
    "                    self.unc_pmem_fc_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    pmem_mc\"):\n",
    "                    self.unc_pmem_mc_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    m2m\"):\n",
    "                    self.unc_m2m_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    qpi\"):\n",
    "                    self.unc_qpi_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    r3qpi\"):\n",
    "                    self.unc_r3qpi_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    iio\"):\n",
    "                    self.unc_iio_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    irp\"):\n",
    "                    self.unc_irp_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    pcu\"):\n",
    "                    self.unc_pcu_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    ubox\"):\n",
    "                    self.unc_ubox_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    m2pcie\"):\n",
    "                    self.unc_m2pcie_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "                elif l.startswith(\"    rdt\"):\n",
    "                    self.unc_rdt_cnt=int(re.split(\" +\",l)[-1])*2\n",
    "        else:\n",
    "            raise Exception(\"Wrong, no emonv specified\")\n",
    "            \n",
    "        # Optional epoch-ms window marking which samples are 'valid' (see load_data).\n",
    "        self.begin_clk=0\n",
    "        self.end_clk=0\n",
    "            \n",
    "        self.corecnt=self.totalcores\n",
    "        \n",
    "        # Metric catalog: each entry names its raw emon events (aliased a, b, ...),\n",
    "        # a SQL formula over those aliases, a per-core summation strategy, and an\n",
    "        # output formatter applied to the computed column.\n",
    "        self.emon_metrics=collections.OrderedDict({\n",
    "            'emon_cpuutil':{\n",
    "                'sum_func':self.cores_sum,   \n",
    "                'events':{\n",
    "                    'a':'CPU_CLK_UNHALTED.REF_TSC'\n",
    "                },\n",
    "                'formula':{\n",
    "                    'cpu%':'a/({:f}*{:d})'.format(self.tsc,self.corecnt)\n",
    "                },\n",
    "                'fmt':lambda l: F.round(l, 3)\n",
    "            },\n",
    "            'emon_cpufreq':{\n",
    "                'sum_func':self.cores_sum,   \n",
    "                'events':{\n",
    "                    'a':'CPU_CLK_UNHALTED.THREAD',\n",
    "                    'b':'CPU_CLK_UNHALTED.REF_TSC'\n",
    "                },\n",
    "                'formula':{\n",
    "                    'cpu freq':'a/b*{:f}'.format(self.tsc/1000000)\n",
    "                },\n",
    "                'fmt':lambda l: F.round(l, 3)\n",
    "            },\n",
    "            'emon_instr_retired':{\n",
    "                'sum_func':self.cores_sum,   \n",
    "                'events':{\n",
    "                    'a':'INST_RETIRED.ANY'\n",
    "                },\n",
    "                'formula':{\n",
    "                    'pathlength':'a/1000000000'\n",
    "                },\n",
    "                'fmt':lambda l: F.round(l, 0)\n",
    "            },\n",
    "            'emon_ipc':{\n",
    "                'sum_func':self.cores_sum,   \n",
    "                'events':{\n",
    "                    'a':'CPU_CLK_UNHALTED.THREAD',\n",
    "                    'b':'INST_RETIRED.ANY'\n",
    "                },\n",
    "                'formula':{\n",
    "                    'ipc':'b/a'\n",
    "                },\n",
    "                'fmt':lambda l: F.round(l, 3)\n",
    "            }\n",
    "        })\n",
    "        self.effective_metric=None\n",
    "        self.appclients=[] # there is no appid and client column\n",
    "\n",
    "    def count_sum(self,collected_cores):\n",
    "        # Sum the requested per-core columns (_3.. correspond to core 0..);\n",
    "        # each count is divided by _2 and scaled by the TSC frequency --\n",
    "        # presumably _2 is the per-sample clock interval, so this yields\n",
    "        # events/second. TODO confirm the column layout.\n",
    "        return F.expr('+'.join(['_{:d}/_2*{:d}'.format(c+3,self.tsc) for c in collected_cores]))\n",
    "\n",
    "    def cores_sum(self,collected_cores):\n",
    "        return self.count_sum(collected_cores)\n",
    "\n",
    "    def mem_sum(self,collected_cores):\n",
    "        return self.count_sum(collected_cores)\n",
    "\n",
    "    def pcie_sum(self,collected_cores):\n",
    "        # UNC_IIO events ignore the requested cores and always use fixed\n",
    "        # columns 2,3,7,8 -- presumably IIO stack slots; TODO confirm.\n",
    "        return self.count_sum([2,3,7,8])\n",
    "        \n",
    "    def list_metric(self):\n",
    "        '''Print every effective metric and its sub-metric (formula) names.'''\n",
    "        if self.effective_metric is None:\n",
    "            self.get_effective_metric()\n",
    "        for k in self.effective_metric:\n",
    "            m=self.emon_metrics[k]\n",
    "            print(k)\n",
    "            for fk,fm in m['formula'].items():\n",
    "                print(\"    \",fk)\n",
    "            \n",
    "    def load_data(self):\n",
    "        '''Parse the raw emon text into self.df and cache it as emon.parquet.'''\n",
    "        paths=os.path.split(self.file)\n",
    "        # Fast path: reuse a previously materialized parquet conversion.\n",
    "        if fs.exists(paths[0]+\"/emon.parquet/_SUCCESS\"):\n",
    "            self.df=spark.read.parquet(paths[0]+\"/emon.parquet\")\n",
    "            self.df.cache()\n",
    "            return\n",
    "        \n",
    "        # NOTE(review): relies on notebook globals `sc` and `spark`.\n",
    "        emondata=sc.textFile(self.file)\n",
    "        emondf=emondata.mapPartitions(splits_fill0).toDF()\n",
    "        emondf=emondf.withColumn(\"id\", F.monotonically_increasing_id())\n",
    "        # g_id: row id of the '======' separator opening each sample block.\n",
    "        giddf=emondf.where(emondf._1.rlike(\"======\")).selectExpr(\"id as g_id\")\n",
    "        \n",
    "        # r_id: row id of the nearest preceding timestamp row (date in _1,\n",
    "        # time in _2); counter rows (numeric _2) are tied to that timestamp.\n",
    "        iddf=emondf.where(emondf._1.rlike(\"\\d\\d/\")).selectExpr(\"_1 as r_1\",\"_2 as r_2\",\"id as r_id\")\n",
    "        jfid=emondf.where(emondf._2.rlike(\"^[1-9][0-9][0-9]+\")).join(iddf,on=[emondf.id>iddf.r_id]).groupBy('id').agg(F.max('r_id').alias('r_id'))\n",
    "        iddf=iddf.join(jfid,on='r_id',how='left')\n",
    "        emondf=emondf.where(emondf._2.rlike(\"^[1-9][0-9][0-9]+\")).join(iddf,on='id',how='left')\n",
    "        \n",
    "        # Attach each row to its enclosing sample block (max g_id below the row id).\n",
    "        jfid=emondf.join(giddf,on=[emondf.id>giddf.g_id]).groupBy('id').agg(F.max('g_id').alias('g_id'))\n",
    "        giddf=giddf.join(jfid,on='g_id',how='left')\n",
    "        emondf=emondf.join(giddf,on='id',how='inner')\n",
    "        \n",
    "        df=emondf\n",
    "\n",
    "        # Counter columns (everything except _1/_2) become longs for arithmetic.\n",
    "        select_list = []\n",
    "        for idx, c in enumerate(df.columns):\n",
    "            if idx >= 2 and c.startswith('_'):\n",
    "                select_list.append(col(c).cast(LongType()).alias(c))\n",
    "            else:\n",
    "                select_list.append(col(c))\n",
    "        df=df.select(select_list)\n",
    "\n",
    "        # Epoch milliseconds from the 'MM/dd/yyyy HH:mm:ss.f' timestamp pair.\n",
    "        df=df.withColumn(\"timestamp\",F.unix_timestamp(F.concat_ws(' ','r_1','r_2'),'MM/dd/yyyy HH:mm:ss')*F.lit(1000)+(F.split(F.col('r_2'),'\\.')[1]).astype(IntegerType()))\n",
    "        df=df.drop(\"r_1\")\n",
    "        df=df.drop(\"r_2\")\n",
    "        \n",
    "        cores=list(range(0,self.totalcores))\n",
    "        df=df.withColumn('sum',\n",
    "                         F.when(F.col(\"_1\").startswith(\"UNC_IIO\"),self.pcie_sum(cores))\n",
    "                         .otherwise(self.cores_sum(cores)))\n",
    "        if self.begin_clk>0 and self.end_clk>0:\n",
    "            df=df.withColumn('valid',((F.col(\"timestamp\")>F.lit(self.begin_clk)) & (F.col(\"timestamp\")<F.lit(self.end_clk))))\n",
    "        else:\n",
    "            df=df.withColumn('valid',F.lit(True))\n",
    "        \n",
    "        df.repartition(3).write.mode(\"overwrite\").parquet(paths[0]+\"/emon.parquet\")\n",
    "        self.df=df\n",
    "        df.cache()\n",
    "        \n",
    "    def get_effective_metric(self):\n",
    "        '''Keep only metrics whose raw events all appear in the first sample block.'''\n",
    "        if self.df==None:  # NOTE(review): prefer `is None`\n",
    "            self.load_data()\n",
    "\n",
    "        emondf=self.df\n",
    "        gid=emondf.agg(F.min('g_id')).collect()[0]['min(g_id)']\n",
    "        emondf=emondf.where(F.col(\"g_id\")==gid)\n",
    "        emondf=emondf.cache()\n",
    "\n",
    "        effective_metric=[]\n",
    "\n",
    "        progress = IntProgress(layout=Layout(width='80%', height='40px'))\n",
    "        progress.max = len(self.emon_metrics)\n",
    "        progress.description = 'Calculate Effective Metrics'\n",
    "        display(progress)\n",
    "        progress.value=0\n",
    "\n",
    "        for k,m in self.emon_metrics.items():\n",
    "            join_df=None\n",
    "            progress.value=progress.value+1\n",
    "            # Join the row sets of every event in the metric; events sharing a\n",
    "            # timestamp join on r_id, otherwise fall back to the block id g_id.\n",
    "            for alias,event in m['events'].items():\n",
    "                if join_df is None:\n",
    "                    join_df=emondf.where(\"_1='{:s}'\".format(event)).select('r_id','g_id')\n",
    "                else:\n",
    "                    tdf=emondf.where(\"_1='{:s}'\".format(event)).select('r_id','g_id')\n",
    "                    join_dft=join_df.join(tdf.drop('g_id'),on='r_id',how='inner')\n",
    "                    if join_dft.count()==0:\n",
    "                        join_df=join_df.join(tdf.drop('r_id'),on='g_id',how='inner')\n",
    "                    else:\n",
    "                        join_df=join_dft\n",
    "            if join_df.count()>0:\n",
    "                effective_metric.append(k)\n",
    "        progress.value=progress.value+1\n",
    "        self.effective_metric=effective_metric\n",
    "        emondf.unpersist()\n",
    "    \n",
    "    def gen_metric(self,emondf, m):\n",
    "        '''Join one row set per event of metric `m`, exposing each event's\n",
    "        'sum' column under its alias so the metric formula can reference it.'''\n",
    "        join_df=None\n",
    "        for alias,event in m['events'].items():\n",
    "            if join_df is None:\n",
    "                join_df=emondf.where(\"_1='{:s}'\".format(event)).select('timestamp','_1','_2','r_id','g_id',*self.appclients,F.col('sum').alias(alias))\n",
    "            else:\n",
    "                tdf=emondf.where(\"_1='{:s}'\".format(event)).select('_1','_2','r_id','g_id',*self.appclients,F.col('sum').alias(alias))\n",
    "                join_dft=join_df.join(tdf.drop('g_id'),on=['r_id',*self.appclients],how='inner')\n",
    "                if join_dft.count()==0:\n",
    "                    join_df=join_df.join(tdf.drop('r_id'),on=['g_id',*self.appclients],how='inner')\n",
    "                else:\n",
    "                    join_df=join_dft\n",
    "        return join_df\n",
    "\n",
    "    \n",
    "    \n",
    "    def generate_trace_view_list(self,id=0, **kwargs):\n",
    "        '''Emit chrome://tracing counter events for every effective metric.'''\n",
    "        trace_events=Analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        \n",
    "        cores=list(range(0,self.totalcores))\n",
    "        \n",
    "        emondf=self.df\n",
    "        if 'collected_cores' in kwargs:\n",
    "            cores=kwargs.get(\"collected_cores\",None)\n",
    "            emondf=emondf.withColumn('sum',\n",
    "                     F.when(F.col(\"_1\").startswith(\"UNC_IIO\"),self.pcie_sum(cores))\n",
    "                     .otherwise(self.cores_sum(cores)))\n",
    "        show_metric= kwargs.get('show_metric', None)\n",
    "            \n",
    "        if show_metric is None and self.effective_metric is None:\n",
    "            self.get_effective_metric()\n",
    "\n",
    "        # NOTE(review): a non-None show_metric permanently overwrites\n",
    "        # self.effective_metric for later calls.\n",
    "        self.effective_metric=show_metric if show_metric is not None else self.effective_metric\n",
    "        \n",
    "        # NOTE(review): this reassignment discards the collected_cores 'sum'\n",
    "        # recomputation above -- looks like a bug; confirm intent.\n",
    "        emondf=self.df\n",
    "        \n",
    "        tid=0\n",
    "        for k in self.effective_metric:\n",
    "            m=self.emon_metrics[k]\n",
    "            join_df=self.gen_metric(emondf,m)\n",
    "            rstdf=join_df.select(\n",
    "                            F.lit(tid).alias('tid'),\n",
    "                            F.lit(id).alias('pid'),\n",
    "                            F.lit('C').alias('ph'),\n",
    "                            F.lit(k).alias('name'),\n",
    "                            (F.col('timestamp')-F.lit(self.starttime)).alias(\"ts\"),\n",
    "                            F.struct(*[m['fmt'](F.expr(formula)).alias(col_name) for col_name,formula in m['formula'].items() ]).alias('args')\n",
    "            ).where(F.col(\"ts\").isNotNull()).orderBy('ts')\n",
    "            trace_events.extend(rstdf.toJSON().collect())\n",
    "            trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":tid,\"args\":{\"sort_index \":tid}}))\n",
    "            tid=tid+1        \n",
    "\n",
    "        return trace_events\n",
    "    \n",
    "    def show_emon_metric(self,metric,sub_metric,core,draw=True,metric_define=None, **kwargs):\n",
    "        '''Summarize and (optionally) plot one sub-metric over time for `core`.\n",
    "\n",
    "        `metric` may be None/'' -- the owning metric is then looked up among the\n",
    "        effective metrics by sub_metric name. Returns the per-sample Spark\n",
    "        DataFrame (ts, value, r_id).\n",
    "        '''\n",
    "        if self.df==None:  # NOTE(review): prefer `is None`\n",
    "            self.load_data()\n",
    "        emondf=self.df\n",
    "        \n",
    "        showalltime=kwargs.get(\"showalltime\",True)\n",
    "        \n",
    "        if not showalltime:\n",
    "            emondf=emondf.filter(F.col(\"valid\")==F.lit(True))\n",
    "        \n",
    "        if metric is None or metric=='':\n",
    "            # for/else: the else branch runs only when no formula matched.\n",
    "            for k in self.effective_metric:\n",
    "                m=self.emon_metrics[k]\n",
    "                if sub_metric in m['formula']:\n",
    "                    break\n",
    "            else:\n",
    "                print(\"can't find metric\",sub_metric)\n",
    "                return        \n",
    "        else:\n",
    "            k=metric\n",
    "        if metric_define is None:\n",
    "            m= self.emon_metrics[k]\n",
    "        else:\n",
    "            m= metric_define[k]\n",
    "\n",
    "        if type(core)==int:\n",
    "            core=[core,]\n",
    "        emondf=emondf.withColumn('sum',\n",
    "                 F.when(F.col(\"_1\").startswith(\"UNC_IIO\"),self.pcie_sum(core))\n",
    "                 .otherwise(self.count_sum(core)))\n",
    "            \n",
    "        join_df=self.gen_metric(emondf,m)\n",
    "        \n",
    "        rstdf=join_df.select(\n",
    "                    F.col('timestamp').alias('ts'),\n",
    "                    m['fmt'](F.expr(m['formula'][sub_metric])).alias(sub_metric),\n",
    "                    'r_id'\n",
    "        ).where(F.col(\"timestamp\").isNotNull()).orderBy('timestamp')\n",
    "        \n",
    "        metric_sum=rstdf.select(sub_metric).summary().toPandas()\n",
    "        display(metric_sum)\n",
    "        \n",
    "        if draw:\n",
    "            # Left panel: value distribution (violin); right: time series with\n",
    "            # summary() rows 4..7 (25%/50%/75%/max -- confirm ordering) overlaid.\n",
    "            pddf=rstdf.toPandas()\n",
    "            pddf['ts']=(pddf['ts']-pddf.loc[0,'ts'])/1000\n",
    "            fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True,figsize=(30,8),gridspec_kw = {'width_ratios':[1, 5]})\n",
    "            plt.subplots_adjust(wspace=0.01)\n",
    "            sns.violinplot(y=sub_metric, data=pddf, ax=axs[0],palette=['g'])\n",
    "            axs[0].yaxis.grid(True, which='major')\n",
    "            ax=axs[1]\n",
    "            ax.stackplot(pddf['ts'], pddf[sub_metric],colors=['bisque'])\n",
    "            #ymin, ymax = ax.get_ylim()\n",
    "            ax2 = ax.twinx()\n",
    "            ax2.set_ylim(ax.get_ylim())\n",
    "            ax2.axhline(y=float(metric_sum.loc[4,sub_metric]), linewidth=2, color='r')\n",
    "            ax2.axhline(y=float(metric_sum.loc[5,sub_metric]), linewidth=2, color='r')\n",
    "            ax2.axhline(y=float(metric_sum.loc[6,sub_metric]), linewidth=2, color='r')\n",
    "            ax2.axhline(y=float(metric_sum.loc[7,sub_metric]), linewidth=2, color='r')\n",
    "            ax.set_xlabel('time (s)')\n",
    "            ax.yaxis.grid(True, which='major')\n",
    "            plt.show()\n",
    "            \n",
    "            # Second figure: 15-bin histogram of the metric values.\n",
    "            hist_elapsedtime=rstdf.select('`{:s}`'.format(sub_metric)).rdd.flatMap(lambda x: x).histogram(15)\n",
    "            fig, axs = plt.subplots(figsize=(30, 5))\n",
    "            ax=axs\n",
    "            binSides, binCounts = hist_elapsedtime\n",
    "            binSides=[builtins.round(l,2) for l in binSides]\n",
    "\n",
    "            N = len(binCounts)\n",
    "            ind = numpy.arange(N)\n",
    "            width = 0.5\n",
    "\n",
    "            rects1 = ax.bar(ind+0.5, binCounts, width, color='b')\n",
    "\n",
    "            ax.set_ylabel('Frequencies')\n",
    "            ax.set_title(sub_metric)\n",
    "            ax.set_xticks(numpy.arange(N+1))\n",
    "            ax.set_xticklabels(binSides)\n",
    "        return rstdf\n",
    "        \n",
    "\n",
    "    def gen_reduce_metric(self,metric,core,sub_metric,agg_func):\n",
    "        '''Build the per-sample DataFrame for one sub-metric over `core`,\n",
    "        restricted to rows in the valid clock window. `agg_func` is accepted\n",
    "        for interface symmetry but not used in this method.'''\n",
    "        if self.df==None:  # NOTE(review): prefer `is None`\n",
    "            self.load_data()\n",
    "        emondf=self.df\n",
    "        \n",
    "        emondf=emondf.where(F.col(\"valid\")==F.lit(True))\n",
    "        \n",
    "        k=metric\n",
    "        m= self.emon_metrics[k]\n",
    "\n",
    "        if type(core)==int:\n",
    "            core=[core,]\n",
    "        \n",
    "        # Recompute 'sum' only when a strict subset of cores is requested.\n",
    "        if len(core)<self.totalcores:\n",
    "            emondf=emondf.withColumn('sum',\n",
    "                     F.when(F.col(\"_1\").startswith(\"UNC_IIO\"),self.pcie_sum(core))\n",
    "                     .otherwise(self.count_sum(core)))\n",
    "\n",
    "        join_df=self.gen_metric(emondf,m)\n",
    "        \n",
    "        rstdf=join_df.select(\n",
    "                *self.appclients,\n",
    "                m['fmt'](F.expr(m['formula'][sub_metric])).alias(sub_metric)\n",
    "        ).where(F.col(\"timestamp\").isNotNull())\n",
    "        return rstdf\n",
    "    \n",
    "    def get_reduce_metric(self,metric,core,sub_metric,agg_func):\n",
    "        '''Apply each aggregate in agg_func to the sub-metric; return a 1-row pandas frame.'''\n",
    "        rstdf=self.gen_reduce_metric(metric,core,sub_metric,agg_func)\n",
    "        return rstdf.agg(*[l(\"`{:s}`\".format(sub_metric)).alias(get_alias_name(sub_metric,l)) for l in agg_func]).toPandas()\n",
    "   \n",
    "    def get_reduce_metrics(self,core=None,agg_func=[F.max,F.mean,F.min,F.sum]):\n",
    "        '''Aggregate every sub-metric of every effective metric into one pandas\n",
    "        table (rows = sub-metrics, columns = agg-function names).\n",
    "\n",
    "        NOTE(review): mutable default list for agg_func is shared across calls;\n",
    "        harmless while never mutated, but worth confirming.\n",
    "        '''\n",
    "        coldf=None\n",
    "        if self.effective_metric is None:\n",
    "            self.get_effective_metric()\n",
    "\n",
    "        if core is None:\n",
    "            core=list(range(0,self.totalcores))\n",
    "        progress = IntProgress(layout=Layout(width='80%', height='40px'))\n",
    "        progress.max = len(self.effective_metric)\n",
    "        progress.description = 'Calculate Effective Metrics'\n",
    "        display(progress)\n",
    "        progress.value=0\n",
    "        \n",
    "        columns=[f.__name__ for f in agg_func]\n",
    "            \n",
    "        for k in self.effective_metric:\n",
    "            progress.value=progress.value+1\n",
    "            m=self.emon_metrics[k]\n",
    "            for fk,fm in m['formula'].items():\n",
    "                df=self.get_reduce_metric(k,core,fk,agg_func)\n",
    "                df.columns=columns\n",
    "                df.index=[fk]\n",
    "                if coldf is None:\n",
    "                    coldf=df\n",
    "                else:\n",
    "                    # NOTE(review): DataFrame.append was removed in pandas 2.0;\n",
    "                    # pandas.concat([coldf, df]) is the replacement there.\n",
    "                    coldf=coldf.append(df)\n",
    "        progress.value=progress.value+1\n",
    "        return coldf\n",
    "    \n",
    "\n",
    "class Emon_Analysis_All(Emon_Analysis):\n",
    "    \"\"\"EMON analysis over a list of emon parquet files (multiple apps/clients).\n",
    "\n",
    "    Reuses the single-file Emon_Analysis machinery but tags every row with\n",
    "    appid/client derived from each parquet file path, and aggregates per appid.\n",
    "    \"\"\"\n",
    "    def __init__(self,emon_files):\n",
    "        # Initialize the base class with the first file only; the full list\n",
    "        # is loaded later by load_data().\n",
    "        Emon_Analysis.__init__(self\n",
    "                               ,emon_files[0])\n",
    "        self.emon_files=emon_files\n",
    "        # extra grouping columns carried through every result frame\n",
    "        self.appclients=['appid','client']\n",
    "        \n",
    "    def load_data(self):\n",
    "        # Load all emon parquet files at once and derive appid/client from the\n",
    "        # file path: .../<appid>/<client>/emon.parquet (4th- and 3rd-from-last\n",
    "        # path components, measured on one sampled path).\n",
    "        spark.clearCache()\n",
    "        emondf=spark.read.format(\"parquet\").load(self.emon_files)\n",
    "        emondf=emondf.withColumn(\"file\",F.input_file_name())\n",
    "        filepath=emondf.select(F.col(\"file\")).limit(1).collect()[0]['file']\n",
    "        length=len(filepath.split(\"/\"))\n",
    "        # NOTE(review): assumes every loaded path has the same depth -- confirm.\n",
    "        emondf=emondf.withColumn(\"appid\",F.split(\"file\",\"/\")[length-4])\n",
    "        emondf=emondf.withColumn(\"client\",F.split(\"file\",\"/\")[length-3]).drop(\"file\")\n",
    "        emondf=emondf.cache()\n",
    "        self.df=emondf\n",
    "        \n",
    "    def get_reduce_metric(self,metric,core=None,sub_metric=None,agg_func=[F.max,F.mean,F.min,F.sum]):\n",
    "        # Aggregate one sub-metric per appid; defaults: all cores, and the\n",
    "        # first formula of the metric when sub_metric is not given.\n",
    "        \n",
    "        if core is None:\n",
    "            core=list(range(0,self.totalcores))\n",
    "        if sub_metric is None:\n",
    "            m=self.emon_metrics[metric]\n",
    "            sub_metric = list(m['formula'].keys())[0]\n",
    "        \n",
    "        rstdf=self.gen_reduce_metric(metric,core,sub_metric,agg_func)\n",
    "        return rstdf.groupBy(\"appid\").agg(*[l(\"`{:s}`\".format(sub_metric)).alias(get_alias_name(sub_metric,l)) for l in agg_func]).toPandas()\n",
    "    \n",
    "    def get_reduce_metrics(self,core=None,agg_func=[F.max,F.mean,F.min,F.sum]):\n",
    "        # Bulk reduction is intentionally disabled for the multi-file case.\n",
    "        return None\n",
    "    \n",
    "    def generate_trace_view_list(self, id , **kwargs):\n",
    "        # Build chrome://tracing counter events: one process per client (pid\n",
    "        # from the mandatory pidmap kwarg), one counter track (tid) per metric.\n",
    "        # NOTE(review): the `id` parameter is never read before the loops below\n",
    "        # rebind the name -- confirm it can be dropped from the signature.\n",
    "        Analysis.generate_trace_view_list(self,0)\n",
    "        \n",
    "        cores=list(range(0,self.totalcores))\n",
    "        \n",
    "        pidmap=kwargs.get(\"pidmap\",None)\n",
    "        if pidmap is None:\n",
    "            print(\"multiple emon process needs pidmap in {'client':pid,} format\")\n",
    "            return []\n",
    "        else:\n",
    "            display(pidmap)\n",
    "            \n",
    "        emondf=self.df\n",
    "        if 'collected_cores' in kwargs:\n",
    "            # Recompute the 'sum' column over the collected core subset; PCIe\n",
    "            # (UNC_IIO*) counters use their dedicated summation.\n",
    "            cores=kwargs.get(\"collected_cores\",None)\n",
    "            emondf=emondf.withColumn('sum',\n",
    "                     F.when(F.col(\"_1\").startswith(\"UNC_IIO\"),self.pcie_sum(cores))\n",
    "                     .otherwise(self.cores_sum(cores)))\n",
    "        show_metric= kwargs.get('show_metric', None)\n",
    "            \n",
    "        if show_metric is None and self.effective_metric is None:\n",
    "            self.get_effective_metric()\n",
    "\n",
    "        self.effective_metric=show_metric if show_metric is not None else self.effective_metric\n",
    "        \n",
    "        # client -> pid lookup usable inside Spark column expressions\n",
    "        mapexpr=F.create_map([F.lit(x) for x in chain(*pidmap.items())])\n",
    "        \n",
    "        trace_events=[]\n",
    "        for c,id in pidmap.items():\n",
    "            trace_events.append(json.dumps({\"name\": \"process_name\",\"ph\": \"M\",\"pid\":id,\"tid\":0,\"args\":{\"name\":\" \"+c}}))\n",
    "        tid=0\n",
    "        for k in self.effective_metric:\n",
    "            m=self.emon_metrics[k]\n",
    "            join_df=self.gen_metric(emondf,m)\n",
    "            join_df=join_df.withColumn('pid',mapexpr.getItem(F.col(\"client\")))\n",
    "            # counter ('C') events, timestamps relative to the run start\n",
    "            rstdf=join_df.select(\n",
    "                            F.lit(tid).alias('tid'),\n",
    "                            F.col('pid').alias('pid'),\n",
    "                            F.lit('C').alias('ph'),\n",
    "                            F.lit(k).alias('name'),\n",
    "                            (F.col('timestamp')-F.lit(self.starttime)).alias(\"ts\"),\n",
    "                            F.struct(*[m['fmt'](F.expr(formula)).alias(col_name) for col_name,formula in m['formula'].items() ]).alias('args')\n",
    "            ).where(F.col(\"ts\").isNotNull()).orderBy('ts')\n",
    "            trace_events.extend(rstdf.toJSON().collect())\n",
    "            for id in pidmap.values():\n",
    "                trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":tid,\"args\":{\"sort_index \":tid}}))\n",
    "            tid=tid+1\n",
    "        return trace_events    \n",
    "    \n",
    "    \n",
    "def get_emon_parquets(apps,basedir):\n",
    "    # Collect per-client emon.parquet paths for each app id under basedir,\n",
    "    # converting any missing parquet from its emon.rst source on the fly.\n",
    "    # Returns the list of parquet paths (fix: dropped unused emondfunion local).\n",
    "    emondfs=[]\n",
    "    for appid in apps:\n",
    "        slaves=fs.list_status(\"/\"+basedir+\"/\"+appid)\n",
    "        # keep client sub-directories only; summary.parquet is not a client dir\n",
    "        slaves=[f['pathSuffix'] for f in slaves if f['type']=='DIRECTORY' and f['pathSuffix']!=\"summary.parquet\"]\n",
    "        for client in slaves:\n",
    "            if not fs.exists(f\"/{basedir}/{appid}/{client}/emon.parquet\"):\n",
    "                print(f\"/{basedir}/{appid}/{client}/emon.parquet is not found, trying to load data ...\")\n",
    "                # Emon_Analysis.load_data() materializes the parquet from the rst\n",
    "                emonals=Emon_Analysis(f\"/{basedir}/{appid}/{client}/emon.rst\")\n",
    "                emonals.load_data()\n",
    "            emondfs.append(f\"/{basedir}/{appid}/{client}/emon.parquet\")\n",
    "    return emondfs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# app log analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_his_perf(namelike,currentdir):\n",
    "    \"\"\"Plot elapsed time of historical runs whose Spark app name starts with\n",
    "    namelike, scanning /gluten for app.log files from the last 60 days and\n",
    "    always including currentdir.\"\"\"\n",
    "    dird=fs.listdir(\"/gluten\")\n",
    "    apps=[]\n",
    "    for l in dird:\n",
    "        # date-stamped dirs sort lexicographically as YYYY_MM_DD, so a plain\n",
    "        # string compare keeps the last 60 days. Use datetime.timedelta\n",
    "        # explicitly: only `date` is imported from datetime at the top.\n",
    "        if l.startswith(\"2\") and l>(date.today() - datetime.timedelta(days=60)).strftime(\"%Y_%m_%d\"):\n",
    "            for r in fs.listdir(\"/gluten/\"+l):\n",
    "                if fs.exists(\"/gluten/\"+l+\"/\"+r+\"/app.log\"):\n",
    "                    apps.append(\"/gluten/\"+l+\"/\"+r+\"/app.log\")\n",
    "    if currentdir not in apps:\n",
    "        apps.append(currentdir)\n",
    "    appdf=spark.read.json(apps)\n",
    "    appdf=appdf.withColumn(\"filename\", F.input_file_name())\n",
    "    # first job submission per file marks app start; last job end marks finish\n",
    "    starttime=appdf.where(\"Properties.`spark.app.name` like '\"+namelike+\"%' and Event='SparkListenerJobStart'\").select(\"filename\",F.col('Properties.`spark.app.name`').alias(\"appname\"),F.col('Submission Time').alias(\"starttime\"))\n",
    "    finishtime=appdf.where(\"Event='SparkListenerJobEnd'\").select(\"filename\",F.col('Completion Time').alias(\"finishtime\"))\n",
    "    starttime=starttime.groupBy(\"filename\").agg(F.max(\"appname\").alias(\"appname\"),F.min(\"starttime\").alias(\"starttime\"))\n",
    "    finishtime=finishtime.groupBy(\"filename\").agg(F.max(\"finishtime\").alias(\"finishtime\"))\n",
    "    elapsedtime=starttime.join(finishtime,\"filename\").orderBy(\"starttime\").select(F.date_format(F.from_unixtime(F.col('starttime')/1000),\"yyyy_MM_dd\").alias(\"test_date\"),(F.col(\"finishtime\")/1000-F.col(\"starttime\")/1000).alias(\"elapsedtime\"))\n",
    "    epsdf=elapsedtime.toPandas()\n",
    "    epsdf.plot(x='test_date',y=['elapsedtime'],style=\"-*\",figsize=(30,8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import udf\n",
    "@udf(\"long\")\n",
    "def isfinish_udf(s):\n",
    "    # Spark UDF: given the JSON-serialized `children` array of a plan node,\n",
    "    # return 1 when every node in the tree is a final plan, else 0.\n",
    "    import json\n",
    "    s=json.loads(s)\n",
    "    def isfinish(root):\n",
    "        # A node is unfinished if its simpleString still says isFinalPlan=false.\n",
    "        # NOTE(review): a node with children=None also yields 0 here -- confirm\n",
    "        # leaf nodes are meant to count as unfinished.\n",
    "        if \"isFinalPlan=false\" in root['simpleString'] or root['children'] is None:\n",
    "            return 0\n",
    "        for c in root[\"children\"]:\n",
    "            if isfinish(c)==0:\n",
    "                return 0\n",
    "        return 1\n",
    "    if len(s)>0:\n",
    "        return isfinish(s[0])\n",
    "    else:\n",
    "        return 0\n",
    "    \n",
    "@pandas_udf(\"taskid long, start long, dur long, name string\", PandasUDFType.GROUPED_MAP)\n",
    "def time_breakdown(pdf):\n",
    "    # Grouped-map UDF over one group's metric rows: lay each metric's Update\n",
    "    # duration end-to-end starting just after Launch Time, scaled so the sum\n",
    "    # never exceeds the task's launch->finish wall-clock window.\n",
    "    ltime=pdf['Launch Time'][0]+2\n",
    "    pdf['start']=0\n",
    "    pdf['dur']=0\n",
    "    outpdf=[]\n",
    "    # scale factor capping summed metric time at the task duration (<=1)\n",
    "    ratio=(pdf[\"Finish Time\"][0]-pdf[\"Launch Time\"][0])/pdf[\"Update\"].sum()\n",
    "    ratio=1 if ratio>1 else ratio\n",
    "    for idx,l in pdf.iterrows():\n",
    "        # drop slices that round down to <=1ms after scaling\n",
    "        if(l[\"Update\"]*ratio>1):\n",
    "            outpdf.append([l[\"Task ID\"],ltime,int(l[\"Update\"]*ratio),l[\"mname\"]])\n",
    "            ltime=ltime+int(l[\"Update\"]*ratio)\n",
    "    if len(outpdf)>0:\n",
    "        return pandas.DataFrame(outpdf)\n",
    "    else:\n",
    "        # empty, typed frame so Spark can union the group results\n",
    "        # NOTE(review): dtype='long' relies on the numpy 'long' alias -- confirm\n",
    "        # it is still accepted by the pinned numpy version.\n",
    "        return pandas.DataFrame({'taskid': pandas.Series([], dtype='long'),\n",
    "                   'start': pandas.Series([], dtype='long'),\n",
    "                   'dur': pandas.Series([], dtype='long'),\n",
    "                   'name': pandas.Series([], dtype='str'),\n",
    "                                })\n",
    "    \n",
    "class App_Log_Analysis(Analysis):\n",
    "    def __init__(self, file, jobids):\n",
    "        # Keep job ids as strings for later SQL IN (...) filtering;\n",
    "        # None means analyze all jobs.\n",
    "        Analysis.__init__(self,file)\n",
    "        if jobids is None:\n",
    "            self.jobids=[]\n",
    "        else:\n",
    "            self.jobids=[str(j) for j in jobids]\n",
    "        self.df=None\n",
    "        self.pids=[]\n",
    "        \n",
    "    def load_data(self):\n",
    "        \"\"\"Parse the Spark event log into task/job/query DataFrames.\n",
    "\n",
    "        Populates (among others): self.appid, self.dfacc (driver accumulator\n",
    "        updates), self.queryplans (final physical plans), self.allmetrics,\n",
    "        config-derived fields, self.df (task-level frame joined to jobs and\n",
    "        query windows) and self.criticaltasks (critical-path approximation).\n",
    "        \"\"\"\n",
    "        print(\"load data \", self.file)\n",
    "        jobids=self.jobids\n",
    "        df=spark.read.json(self.file)\n",
    "        \n",
    "        if 'App ID' in df.columns:\n",
    "            self.appid=df.where(\"`App ID` is not null\").collect()[0][\"App ID\"]\n",
    "        else:\n",
    "            self.appid=\"Application-00000000\"\n",
    "                \n",
    "        # driver-side accumulator updates (SQL metrics), if present in the log\n",
    "        if df.where(\"Event='org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates'\").count()>0:\n",
    "            self.dfacc=df.where(\"Event='org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates'\").select(F.col(\"executionId\").alias(\"queryid\"),F.explode(\"accumUpdates\"))\n",
    "        else:\n",
    "            self.dfacc = None\n",
    "            \n",
    "        # keep only final (non-AQE-intermediate) physical plans\n",
    "        if \"sparkPlanInfo\" in df.columns:\n",
    "            self.queryplans=df.where(\"(Event='org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart' or Event='org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate') \\\n",
    "                                  and (sparkPlanInfo.nodeName!='AdaptiveSparkPlan' or sparkPlanInfo.simpleString='AdaptiveSparkPlan isFinalPlan=true') \").select(F.col(\"executionId\").alias(\"queryid\"),'physicalPlanDescription',\"sparkPlanInfo.*\")\n",
    "        else:\n",
    "            self.queryplans=None\n",
    "        \n",
    "        seen = set()\n",
    "        \n",
    "        if self.queryplans is not None:\n",
    "            self.queryplans=self.queryplans.where(isfinish_udf(F.to_json(\"children\"))==1)\n",
    "        \n",
    "            # walk every plan tree, collecting each accumulator exactly once\n",
    "            self.allmetrics=[]\n",
    "            if self.queryplans.count() > 0:\n",
    "                metrics=self.queryplans.collect()\n",
    "                def get_metric(root):\n",
    "                    for l in root[\"metrics\"]:\n",
    "                        if l['accumulatorId'] not in seen:\n",
    "                            seen.add(l['accumulatorId'])\n",
    "                            self.allmetrics.append([l['accumulatorId'],l[\"metricType\"],l['name'],root[\"nodeName\"]])\n",
    "                    if root['children'] is not None:\n",
    "                        for c in root[\"children\"]:\n",
    "                            get_metric(c)\n",
    "                for c in metrics:\n",
    "                    get_metric(c)\n",
    "        \n",
    "            amsdf=spark.createDataFrame(self.allmetrics)\n",
    "            amsdf=amsdf.withColumnRenamed(\"_1\",\"ID\").withColumnRenamed(\"_2\",\"type\").withColumnRenamed(\"_3\",\"Name\").withColumnRenamed(\"_4\",\"nodeName\")\n",
    "        \n",
    "        \n",
    "        # attach metric name/type/node to each accumulator update\n",
    "        if self.dfacc is not None:\n",
    "            self.dfacc=self.dfacc.select(\"queryid\",(F.col(\"col\")[0]).alias(\"ID\"),(F.col(\"col\")[1]).alias(\"Update\")).join(amsdf,on=[\"ID\"])\n",
    "        \n",
    "        # timing metrics worth breaking down, minus two known-noisy names\n",
    "        if self.queryplans is not None:\n",
    "            self.metricscollect=[l for l in self.allmetrics if l[1] in ['nsTiming','timing'] and (l[2].startswith(\"time to\") or l[2].startswith(\"time of\") or l[2].startswith(\"scan time\") or l[2].startswith(\"shuffle write time\") or l[2].startswith(\"time to spill\") or l[2].startswith(\"task commit time\")) \n",
    "                                 and l[2] not in(\"time to collect batch\", \"time of scan\") ]\n",
    "        \n",
    "        #config=df.where(\"event='SparkListenerJobStart' and Properties.`spark.executor.cores` is not null\").select(\"Properties.*\").limit(1).collect()\n",
    "        config=df.select(\"`Spark Properties`.*\").where(\"`spark.app.id` is not null\").limit(1).collect()\n",
    "    \n",
    "        # pull sizing knobs out of the app config, with conservative defaults\n",
    "        configdic=config[0].asDict()\n",
    "        self.parallelism=int(configdic['spark.sql.shuffle.partitions']) if 'spark.sql.shuffle.partitions' in configdic else 1\n",
    "        self.executor_cores=int(configdic['spark.executor.cores']) if 'spark.executor.cores' in configdic else 1\n",
    "        self.executor_instances=int(configdic['spark.executor.instances']) if 'spark.executor.instances' in configdic else 1\n",
    "        self.taskcpus= int(configdic['spark.task.cpus'])if 'spark.task.cpus' in configdic else 1\n",
    "        self.batchsize= int(configdic['spark.gluten.sql.columnar.maxBatchSize'])if 'spark.gluten.sql.columnar.maxBatchSize' in configdic else 4096\n",
    "        \n",
    "        self.realexecutors = df.where(~F.isnull(F.col(\"Executor ID\"))).select(\"Executor ID\").distinct().count()\n",
    "        \n",
    "        # per-query wall-clock window from SQL execution start/end events\n",
    "        execstart = df.where(\"Event='org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart'\").select(\"executionId\",\"time\")\n",
    "        execend = df.where(\"Event='org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd'\").select(\"executionId\",\"time\")\n",
    "        execstart=execstart.withColumnRenamed(\"time\",\"query_starttime\").withColumnRenamed(\"executionId\",\"queryid\")\n",
    "        execend=execend.withColumnRenamed(\"time\",\"query_endtime\").withColumnRenamed(\"executionId\",\"queryid\")\n",
    "        exectime = execstart.join(execend,on=[\"queryid\"])\n",
    "\n",
    "        if \"spark.sql.execution.id\" in df.where(\"Event='SparkListenerJobStart'\").select(\"Properties.*\").columns:\n",
    "            df_jobstart=df.where(\"Event='SparkListenerJobStart'\").select(\"Job ID\",\"Submission Time\",F.col(\"Properties.`spark.sql.execution.id`\").alias(\"queryid\"),\"Stage IDs\")\n",
    "        else:\n",
    "            df_jobstart=df.where(\"Event='SparkListenerJobStart'\").select(\"Job ID\",\"Submission Time\",F.lit(0).alias(\"queryid\"),\"Stage IDs\")\n",
    "        \n",
    "        df_jobend=df.where(\"Event='SparkListenerJobEnd'\").select(\"`Job ID`\",\"Completion Time\")\n",
    "        df_job=df_jobstart.join(df_jobend,\"Job ID\")\n",
    "        df_job=df_job.withColumnRenamed(\"Submission Time\",\"job_start_time\")\n",
    "        df_job=df_job.withColumnRenamed(\"Completion Time\",\"job_stop_time\")\n",
    "        self.df_job=df_job\n",
    "        \n",
    "        jobstage=df_job.select(\"*\",F.explode(\"Stage IDs\").alias(\"Stage ID\"))\n",
    "        task=df.where(\"(Event='SparkListenerTaskEnd' or Event='SparkListenerTaskStart') \").select(\"Event\",\"Stage ID\",\"task info.*\",\"task metrics.*\")\n",
    "        \n",
    "        self.failed_stages = [str(l['Stage ID']) for l in task.where(\"Failed='true'\").select(\"Stage ID\").distinct().collect()]\n",
    "        \n",
    "        # NOTE(review): the two filters below are identical; confirm whether\n",
    "        # speculativestage was meant to use speculative=true only.\n",
    "        self.speculativetask = task.where(\"speculative = 'true'\").count()\n",
    "        self.speculativekilledtask = task.where(\"speculative = true and killed='true'\").count()\n",
    "        self.speculativestage = task.where(\"speculative = true and killed='true'\").select(\"`Stage ID`\").distinct().count()\n",
    "        \n",
    "        # NOTE(review): the `or` below only drops tasks that are BOTH failed\n",
    "        # and killed; confirm whether `and` was intended.\n",
    "        validtsk = task.where(\"Event = 'SparkListenerTaskEnd' and (Failed<>'true' or killed<>'true')\").select(\"`Task ID`\")\n",
    "        task=task.join(validtsk,on='Task ID',how='inner')\n",
    "        \n",
    "        # flatten the task columns we care about and attach job/query context\n",
    "        taskjob=task.\\\n",
    "            select(\"Host\",\"`Event`\",\"`Launch Time`\",\"`Executor ID`\",\"`Task ID`\",\"`Finish Time`\",\n",
    "                    \"`Stage ID`\",\"`Input Metrics`.`Bytes Read`\",\"`Disk Bytes Spilled`\",\"`Memory Bytes Spilled`\",\"`Shuffle Read Metrics`.`Local Bytes Read`\",\"`Shuffle Read Metrics`.`Remote Bytes Read`\",\n",
    "                   \"`Shuffle Write Metrics`.`Shuffle Bytes Written`\",\"`Executor Deserialize Time`\",\"`Shuffle Read Metrics`.`Fetch Wait Time`\",\"`Executor Run Time`\",\"`Shuffle Write Metrics`.`Shuffle Write Time`\",\n",
    "                   \"`Result Serialization Time`\",\"`Getting Result Time`\",\"`JVM GC Time`\",\"`Executor CPU Time`\",\"Accumulables\",\"Peak Execution Memory\",\n",
    "                    F.when(task['Finish Time']==0,task['Launch Time']).otherwise(task['Finish Time']).alias('eventtime')\n",
    "        ).join(jobstage,\"Stage ID\").where(\"`Finish Time` is null or `Finish Time` <=job_stop_time+5\")\n",
    "        \n",
    "        taskjob = taskjob.join(exectime,on=['queryid'],how='left')\n",
    "        \n",
    "        self.df=taskjob\n",
    "        \n",
    "        if len(jobids)>0:\n",
    "            self.df=self.df.where('`Job ID` in ({:s})'.format(','.join(jobids)))\n",
    "        \n",
    "        queryids=self.df.select(F.col(\"queryid\").astype(IntegerType())).distinct().where(\"queryid is not null\").orderBy(\"queryid\").toPandas()\n",
    "        \n",
    "        # map raw execution ids to 1-based real_queryid values; a run of 103\n",
    "        # queries is treated as TPC-DS and remapped via tpcds_query_map\n",
    "        self.query_num=len(queryids)\n",
    "        if self.query_num>0:\n",
    "            queryidx=queryids.reset_index()\n",
    "            queryidx['index']=queryidx['index']+1\n",
    "            #tpcds query\n",
    "            if self.query_num==103:\n",
    "                queryidx['index']=queryidx['index'].map(tpcds_query_map)\n",
    "            qidx=spark.createDataFrame(queryidx)\n",
    "            qidx=qidx.withColumnRenamed(\"index\",\"real_queryid\")\n",
    "            self.df=self.df.join(qidx,on=\"queryid\",how=\"left\")\n",
    "            if self.dfacc is not None:\n",
    "                self.dfacc=self.dfacc.join(qidx,on=\"queryid\",how='left')\n",
    "\n",
    "            if self.queryplans:\n",
    "                self.queryplans=self.queryplans.join(qidx,\"queryid\",how=\"right\")\n",
    "        \n",
    "        self.df=self.df.fillna(0)\n",
    "        self.df=self.df.withColumn('Executor ID',F.when(F.col(\"Executor ID\")==\"driver\",1).otherwise(F.col(\"Executor ID\")))\n",
    "        self.df.cache()\n",
    "        \n",
    "        \n",
    "        \n",
    "        ##############################\n",
    "        \n",
    "        # Critical-path approximation: start from the latest-finishing task,\n",
    "        # then repeatedly pick the latest finisher overlapping the current\n",
    "        # launch time, walking backwards to the beginning of the run.\n",
    "        dfx=self.df.where(\"Event='SparkListenerTaskEnd'\").select(\"Stage ID\",\"Launch Time\",\"Finish Time\",\"Task ID\")\n",
    "        dfxpds=dfx.toPandas()\n",
    "        dfxpds.columns=[l.replace(\" \",\"_\") for l in dfxpds.columns]\n",
    "        dfxpds_ods=sqldf('''select * from dfxpds order by finish_time desc''')\n",
    "        criticaltasks=[]\n",
    "        idx=0\n",
    "        prefinish=0\n",
    "        launchtime=dfxpds_ods[\"Launch_Time\"][0]\n",
    "        criticaltasks.append([dfxpds_ods[\"Task_ID\"][0],launchtime,dfxpds_ods[\"Finish_Time\"][0]])\n",
    "        total_row=len(dfxpds_ods)\n",
    "\n",
    "        while True:\n",
    "            # advance to the first task finishing before the current launch\n",
    "            while idx<total_row:\n",
    "                if dfxpds_ods[\"Finish_Time\"][idx]-2<launchtime:\n",
    "                    break\n",
    "                idx=idx+1\n",
    "            else:\n",
    "                break\n",
    "            cur_finish=dfxpds_ods[\"Finish_Time\"][idx]\n",
    "            # clamp so consecutive segments never overlap\n",
    "            cur_finish=launchtime-1 if cur_finish>=launchtime else cur_finish\n",
    "            launchtime=dfxpds_ods[\"Launch_Time\"][idx]\n",
    "            criticaltasks.append([dfxpds_ods[\"Task_ID\"][idx],launchtime,cur_finish])\n",
    "        self.criticaltasks=criticaltasks\n",
    "\n",
    "    def get_physical_plan(appals,**kwargs):\n",
    "        \"\"\"Render physical plan(s) from self.queryplans as HTML tables.\n",
    "\n",
    "        kwargs: queryid (render a single query; default all), shownops\n",
    "        (operator whitelist), desensitization (replace table/column names\n",
    "        with generic placeholders; default True).\n",
    "        Returns the raw plan rows collected from appals.queryplans.\n",
    "        \"\"\"\n",
    "        if appals.df is None:\n",
    "            appals.load_data()\n",
    "        queryid=kwargs.get('queryid',None)\n",
    "        shownops=kwargs.get(\"shownops\",['ArrowRowToColumnarExec','ColumnarToRow','RowToArrowColumnar',\n",
    "                                        'VeloxNativeColumnarToRowExec','ArrowColumnarToRow','Filter','HashAggregate','Project','SortAggregate','SortMergeJoin','window'])\n",
    "        \n",
    "        desensitization=kwargs.get('desensitization',True)\n",
    "        \n",
    "        def get_fields(colss):\n",
    "            # Split a column-list string on top-level commas only\n",
    "            # (commas inside parentheses are kept with their expression).\n",
    "            lvls=0\n",
    "            colns=[]\n",
    "            ks=\"\"\n",
    "            for c in colss:\n",
    "                if c==\",\" and lvls==0:\n",
    "                    colns.append(ks)\n",
    "                    ks=\"\"\n",
    "                    continue\n",
    "                if c==\" \" and ks==\"\":\n",
    "                    continue\n",
    "                if c==\"(\":\n",
    "                    lvls+=1\n",
    "                if c==\")\":\n",
    "                    lvls-=1\n",
    "                ks+=c\n",
    "            if ks!=\"\":\n",
    "                colns.append(ks)\n",
    "            return colns\n",
    "        \n",
    "        def get_column_names(s, opname, resultname, prefix, columns, funcs):\n",
    "            # If node s is an opname operator, harvest its resultname field list\n",
    "            # into funcs and record AS-aliased output columns into columns with\n",
    "            # the given desensitization prefix.\n",
    "            p=re.search(r\" \"+opname+\" \",s[0])\n",
    "            if p:\n",
    "                for v in s[1].split(\"\\n\"):\n",
    "                    if v.startswith(resultname):\n",
    "                        cols=re.search(\"\\[([^0-9].+)\\]\",v)\n",
    "                        if cols:\n",
    "                            colss=cols.group(1)\n",
    "                            colns=get_fields(colss)\n",
    "                            if opname+str(len(columns)) not in funcs:\n",
    "                                funcs[opname+str(len(columns))]=[]\n",
    "                            funcs[opname+str(len(columns))].extend(colns)\n",
    "                            for c in colns:\n",
    "                                if \" AS \" in c:\n",
    "                                    c=re.sub(\"#\\d+L*\",\"\",c)\n",
    "                                    colname=re.search(r\" AS (.+)\",c).group(1)\n",
    "                                    if colname not in columns:\n",
    "                                        columns[colname]=prefix\n",
    "        \n",
    "        plans=appals.queryplans.select('real_queryid','physicalPlanDescription').collect() if queryid is None else appals.queryplans.where(f\"real_queryid='{queryid}'\").select(\"physicalPlanDescription\").collect()\n",
    "        \n",
    "        for pr in range(0,len(plans)):\n",
    "            plan=plans[pr]['physicalPlanDescription']\n",
    "            nodes={}\n",
    "            lines=plan.split(\"\\n\")\n",
    "            # First pass: collect the numbered operator header lines from either\n",
    "            # the AQE Final Plan section or the plain Physical Plan section.\n",
    "            for idx in range(0,len(lines)):\n",
    "                l=lines[idx]\n",
    "                if l=='+- == Final Plan ==':\n",
    "                    while l!='+- == Initial Plan ==':\n",
    "                        idx+=1\n",
    "                        l=lines[idx]\n",
    "                        if not l.endswith(\")\"):\n",
    "                            break\n",
    "                        idv=re.search(\"\\(\\d+\\)$\",l).group(0)\n",
    "                        nodes[idv]=[l]\n",
    "                if l==\"== Physical Plan ==\":\n",
    "                    while not lines[idx+1].startswith(\"(\"):\n",
    "                        idx+=1\n",
    "                        l=lines[idx]\n",
    "                        if not l.endswith(\")\"):\n",
    "                            break\n",
    "                        idv=re.search(\"\\(\\d+\\)$\",l).group(0)\n",
    "                        nodes[idv]=[l]\n",
    "                        \n",
    "                # Second pass: attach each node's detail paragraph, scrubbing\n",
    "                # expression ids and literal values (desensitization).\n",
    "                if l.startswith(\"(\"):\n",
    "                    idv=re.search(\"^\\(\\d+\\)\",l).group(0)\n",
    "                    if idv in nodes:\n",
    "                        desc=\"\"\n",
    "                        while l.strip()!=\"\":\n",
    "                            desc+=l+\"\\n\"\n",
    "                            idx+=1\n",
    "                            l=lines[idx]\n",
    "                        desc=re.sub(r\"#\\d+L*\",r\"\",desc)\n",
    "                        desc=re.sub(r\"= [^)]+\",r\"=\",desc)\n",
    "                        desc=re.sub(r\"IN \\([^)]\\)\",r\"IN ()\",desc)\n",
    "                        desc=re.sub(r\"In\\([^)]\\)\",r\"In()\",desc)\n",
    "                        desc=re.sub(r\"EqualTo\\(([^,]+),[^)]+\\)\",r\"EqualTo(\\1,)\",desc)\n",
    "                        # replace a known sensitive keyword (Chinese for 'search ads')\n",
    "                        desc=re.sub(r\"搜索广告\",r\"xxx\",desc)\n",
    "                        ## add all keyword replace here\n",
    "                        nodes[idv].append(desc)\n",
    "            # Map real table/column names to generic placeholders and collect\n",
    "            # per-operator function expressions.\n",
    "            tables={}\n",
    "            columns={}\n",
    "            functions={}\n",
    "            for s in nodes.values():\n",
    "                p=re.search(r\"Scan arrow [^.]*\\.([^ ]+)\",s[0])\n",
    "                if p:\n",
    "                    tn=p.group(1)\n",
    "                    if not tn in tables:\n",
    "                        tables[tn]=\"table\"\n",
    "                    if desensitization:\n",
    "                        s[0]=s[0].replace(tn,tables[tn])\n",
    "                        s[1]=s[1].replace(tn,tables[tn])\n",
    "                    colsv=[]\n",
    "                    schema=[]\n",
    "                    # scanned columns keep their type as the placeholder prefix\n",
    "                    for v in s[1].split(\"\\n\"):\n",
    "                        if v.startswith(\"ReadSchema\"):\n",
    "                            cols=re.search(\"<(.*)>\",v)\n",
    "                            if cols:\n",
    "                                colss=cols.group(1).split(\",\")\n",
    "                                for c in colss:\n",
    "                                    cts=c.split(\":\")\n",
    "                                    ct=cts[0]\n",
    "                                    if not ct in columns:\n",
    "                                        if len(cts)==2:\n",
    "                                            cts[1]=cts[1]\n",
    "                                            columns[ct]=cts[1]+\"_\"\n",
    "                                        else:\n",
    "                                            columns[ct]=\"c_\"\n",
    "                        if v.startswith(\"Location\") and desensitization:\n",
    "                            s[1]=s[1].replace(v+\"\\n\",\"\")\n",
    "                            \n",
    "                get_column_names(s, \"Project\", \"Output\", \"proj_\", columns, functions)\n",
    "                get_column_names(s, \"HashAggregate\", \"Results\", \"shagg_\", columns, functions)\n",
    "                get_column_names(s, \"SortAggregate\", \"Results\", \"stagg_\", columns, functions)\n",
    "                get_column_names(s, \"ColumnarConditionProject\", \"Arguments\", \"cproj_\", columns, functions)\n",
    "                get_column_names(s, \"ColumnarHashAggregate\", \"Results\", \"cshagg_\", columns, functions)\n",
    "                get_column_names(s, \"Window\", \"Arguments\", \"window_\", columns, functions)\n",
    "\n",
    "            # Order replacement keys so that a key which contains another key is\n",
    "            # substituted first (longest/outermost match wins).\n",
    "            keys=[]\n",
    "            ckeys=list(columns.keys())\n",
    "            for l in range(0,len(ckeys)):\n",
    "                k1=ckeys[l]\n",
    "                for k in range(0,len(keys)):\n",
    "                    if keys[k] in k1:\n",
    "                        keys.insert(k,k1)\n",
    "                        break\n",
    "                else:\n",
    "                    keys.append(k1)\n",
    "                \n",
    "            # Apply column-name desensitization to the escaped detail text;\n",
    "            # complex types are highlighted in red/yellow, scalars in green.\n",
    "            for s in nodes.values():\n",
    "                s[1]=html.escape(s[1])\n",
    "                if desensitization:\n",
    "                    for c in keys:\n",
    "                        v=columns[c]\n",
    "                        if v.startswith(\"array\") or v.startswith(\"map\") or v.startswith(\"struct\"):\n",
    "                            s[1]=re.sub(c, '<span style=\"color:red;background-color:yellow\">'+html.escape(v)+\"</span>\",s[1])\n",
    "                        else:\n",
    "                            s[1]=re.sub(c, \"<font color=#33cc33>\"+html.escape(v)+\"</font>\",s[1])\n",
    "\n",
    "\n",
    "            # Render one two-column HTML table per plan: operator | details,\n",
    "            # showing only operators on the shownops whitelist.\n",
    "            htmls=['''<table style=\"table-layout:fixed;max-width: 100%;\">''']\n",
    "            qid=pr+1 if queryid is None else queryid\n",
    "            htmls.append(f\"<tr><td colspan=2>{qid}</td></tr>\")\n",
    "            for l in nodes.values():\n",
    "                if shownops is not None:\n",
    "                    for k in shownops:\n",
    "                        if \" \"+k+\" \" in l[0]:\n",
    "                            break\n",
    "                    else:\n",
    "                        continue\n",
    "                htmls.append(\"<tr>\")\n",
    "                htmls.append('<td width=33%><div align=\"left\" style=\"font-family:Courier New;overflow-wrap: anywhere\">')\n",
    "                htmls.append(l[0].replace(\" \",\"_\")\n",
    "                             .replace(\"ColumnarToRow\",\"<font color=blue>ColumnarToRow</font>\")\n",
    "                             .replace(\"RowToArrowColumnar\",\"<font color=blue>RowToArrowColumnar</font>\")\n",
    "                             .replace(\"ArrowColumnarToRow\",\"<font color=blue>ArrowColumnarToRow</font>\")\n",
    "                             .replace(\"ArrowRowToColumnar\",\"<font color=blue>ArrowRowToColumnar</font>\")\n",
    "                             .replace(\"VeloxNativeColumnarToRowExec\",\"<font color=blue>VeloxNativeColumnarToRowExec</font>\")\n",
    "                            )\n",
    "                htmls.append(\"</div></td>\")\n",
    "                htmls.append('<td width=66%><div align=\"left\" style=\"font-family:Courier New;overflow-wrap: anywhere\">')\n",
    "                ls=l[1].split(\"\\n\")\n",
    "                lsx=[]\n",
    "                for t in ls:\n",
    "                    cols=re.search(\"\\[([^0-9].+)\\]\",t)\n",
    "                    if cols:\n",
    "                        colss=cols.group(1)\n",
    "                        colns=get_fields(colss)\n",
    "                        t=re.sub(\"\\[([^0-9].+)\\]\",\"\",t)\n",
    "                        t+=\"[\"+'<span style=\"background-color:#ededed;\">;</span>'.join(colns)+\"]\"                        \n",
    "                    if \":\" in t:\n",
    "                        lsx.append(re.sub(r'^([^:]+:)',r'<font color=blue>\\1</font>',t))\n",
    "                    else:\n",
    "                        lsx.append(t)\n",
    "                htmls.append(\"<br>\".join(lsx))\n",
    "                htmls.append(\"</div></td>\")\n",
    "                htmls.append(\"</tr>\")\n",
    "            htmls.append(\"</table>\")\n",
    "            display(HTML(\"\\n\".join(htmls)))\n",
    "            \n",
    "            # Second table: function expressions per operator, desensitized the\n",
    "            # same way as the plan details.\n",
    "            for k, v in functions.items():\n",
    "                functions[k]=[l for l in v if \"(\" in l]\n",
    "            for f in functions.values():\n",
    "                for idx in range(0,len(f)):\n",
    "                    for c in keys:\n",
    "                        v=columns[c]\n",
    "                        if v.startswith(\"array\") or v.startswith(\"map\") or v.startswith(\"struct\"):\n",
    "                            f[idx]=re.sub(c, '<span style=\"color:red;background-color:yellow\">'+html.escape(v)+\"</span>\",f[idx])\n",
    "                        else:\n",
    "                            f[idx]=re.sub(c, \"<font color=#33cc33>\"+html.escape(v)+\"</font>\",f[idx])\n",
    "            funchtml=\"<table>\"\n",
    "            for k,v in functions.items():\n",
    "                if shownops is not None:\n",
    "                    for ks in shownops:\n",
    "                        if \" \"+ks+\" \" in k:\n",
    "                            break\n",
    "                    else:\n",
    "                        continue\n",
    "                funchtml+=\"<tr><td width=10%>\"+k+'</td><td width=90%><table stype=\"width:100%;table-layout:fixed\">'\n",
    "                for f in v:\n",
    "                    funchtml+='<tr><td width=100% ><div align=\"left\" style=\"font-family:Courier New\">'+f+\"</div></td></tr>\"\n",
    "                funchtml+=\"</table></td></tr>\"\n",
    "            funchtml+=\"</table>\"    \n",
    "            display(HTML(funchtml))\n",
    "        \n",
    "        return plans\n",
    "        \n",
    "    def get_physical_allnodes(appals,**kwargs):\n",
    "        if appals.df is None:\n",
    "            appals.load_data()\n",
    "        queryid=None\n",
    "        \n",
    "        plans=appals.queryplans.select('real_queryid','physicalPlanDescription').collect() if queryid is None else appals.queryplans.where(f\"real_queryid='{queryid}'\").select(\"physicalPlanDescription\").collect()\n",
    "        \n",
    "        allnodes={}\n",
    "        for pr in range(0,len(plans)):\n",
    "            plan=plans[pr]['physicalPlanDescription']\n",
    "            allnodes[pr]={}\n",
    "            nodes=allnodes[pr]\n",
    "            if plan is None:\n",
    "                continue\n",
    "            lines=plan.split(\"\\n\")\n",
    "            for idx in range(0,len(lines)):\n",
    "                l=lines[idx]\n",
    "                if l=='+- == Final Plan ==':\n",
    "                    while l!='+- == Initial Plan ==':\n",
    "                        idx+=1\n",
    "                        l=lines[idx]\n",
    "                        if not l.endswith(\")\"):\n",
    "                            break\n",
    "                        idv=re.search(\"\\(\\d+\\)$\",l).group(0)\n",
    "                        nodes[idv]=[l]\n",
    "                if l.startswith(\"(\"):\n",
    "                    idv=re.search(\"^\\(\\d+\\)\",l).group(0)\n",
    "                    if idv in nodes:\n",
    "                        desc=\"\"\n",
    "                        while l!=\"\":\n",
    "                            desc+=l+\"\\n\"\n",
    "                            idx+=1\n",
    "                            l=lines[idx]\n",
    "                        nodes[idv].append(desc)\n",
    "        return allnodes\n",
    "        \n",
    "        \n",
    "    def get_basic_state(appals):\n",
    "        if appals.df is None:\n",
    "            appals.load_data()\n",
    "        display(HTML(f\"<a href=http://{localhost}:18080/history/{appals.appid}>http://{localhost}:18080/history/{appals.appid}</a>\"))\n",
    "        \n",
    "        errorcolor=\"#000000\" if appals.executor_instances == appals.realexecutors else \"#c0392b\"\n",
    "        \n",
    "        qtime=appals.get_query_time(plot=False)\n",
    "        sums=qtime.sum()\n",
    "        \n",
    "        total_rchar,total_wchar,total_read_bytes,total_write_bytes,total_cancelled_write_bytes = getexecutor_stat(appals.file[:-len(\"app.log\")])\n",
    "        \n",
    "        if len(appals.failed_stages)>0:\n",
    "            failure=\"<br>\".join([\"query: \" + str(l[\"real_queryid\"])+\"|stage: \" + str(l[\"Stage ID\"]) for l in appals.df.where(\"`Stage ID` in (\"+\",\".join(appals.failed_stages)+\")\").select(\"real_queryid\",\"Stage ID\").distinct().collect()])\n",
    "        else:\n",
    "            failure=\"\"\n",
    "            \n",
    "        stats={\"appid\":appals.appid,\n",
    "            \"executor.instances\":appals.executor_instances,\n",
    "            \"executor.cores\":appals.executor_cores,\n",
    "            \"shuffle.partitions\":appals.parallelism,\n",
    "            \"batch size\":appals.batchsize,\n",
    "            \"real executors\":appals.realexecutors,\n",
    "            \"Failed Tasks\":failure,\n",
    "            \"Speculative Tasks\":appals.speculativetask,\n",
    "            \"Speculative Killed Tasks\":appals.speculativekilledtask,\n",
    "            \"Speculative Stage\":appals.speculativestage,\n",
    "            \"runtime\":round(sums['runtime'],2),\n",
    "            \"disk spilled\":round(sums['disk spilled'],2),\n",
    "            \"memspilled\":round(sums['memspilled'],2),\n",
    "            \"local_read\":round(sums['local_read'],2),\n",
    "            \"remote_read\":round(sums['remote_read'],2),\n",
    "            \"shuffle_write\":round(sums['shuffle_write'],2),\n",
    "            \"task run time\":round(sums['run_time'],2),\n",
    "            \"ser_time\":round(sums['ser_time'],2),\n",
    "            \"f_wait_time\":round(sums['f_wait_time'],2),\n",
    "            \"gc_time\":round(sums['gc_time'],2),\n",
    "            \"input read\":round(sums['input read'],2),\n",
    "            \"acc_task_time\":round(sums['acc_task_time'],2),\n",
    "            \"file read size\":round(total_rchar,2),\n",
    "            \"file write size\":round(total_wchar,2),\n",
    "            \"disk read size\":round(total_read_bytes,2),\n",
    "            \"disk write size\":round(total_write_bytes,2),\n",
    "            \"disk cancel size\":round(total_cancelled_write_bytes,2)\n",
    "            }\n",
    "        \n",
    "        display(HTML(f'''\n",
    "        <table border=\"1\" cellpadding=\"1\" cellspacing=\"1\" style=\"width:500px\">\n",
    "            <tbody>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">appid</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#000000\"><strong>{appals.appid}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">executor.instances</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#000000\"><strong>{appals.executor_instances}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">executor.cores</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#000000\"><strong>{appals.executor_cores}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">shuffle.partitions</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#000000\"><strong>{(appals.parallelism)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">batch size</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#000000\"><strong>{(appals.batchsize):,}</strong></span></td>\n",
    "                </tr>                \n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">real executors</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:{errorcolor}\"><strong>{(appals.realexecutors)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">Failed Tasks</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:{errorcolor}\"><strong>{(failure)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">Speculative Tasks</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#87b00c\"><strong>{(appals.speculativetask)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">Speculative Killed Tasks</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#87b00c\"><strong>{(appals.speculativekilledtask)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">Speculative Stage</td>\n",
    "                    <td style=\"width:351px\"><span style=\"color:#87b00c\"><strong>{(appals.speculativestage)}</strong></span></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">runtime</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['runtime'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">disk spilled</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['disk spilled'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">memspilled</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['memspilled'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">local_read</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['local_read'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">remote_read</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['remote_read'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">shuffle_write</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['shuffle_write'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">task run time</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['run_time'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">ser_time</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['ser_time'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">f_wait_time</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['f_wait_time'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">gc_time</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['gc_time'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">input read</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['input read'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">acc_task_time</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(sums['acc_task_time'],2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">file read size</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(total_rchar,2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">file write size</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(total_wchar,2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">disk read size</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(total_read_bytes,2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">disk write size</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(total_write_bytes,2):,}</strong></td>\n",
    "                </tr>\n",
    "                <tr>\n",
    "                    <td style=\"width:135px\">disk cancel size</td>\n",
    "                    <td style=\"width:351px\"><strong>{round(total_cancelled_write_bytes,2):,}</strong></td>\n",
    "                </tr>\n",
    "            </tbody>\n",
    "        </table>\n",
    "\n",
    "        '''))\n",
    "        return stats\n",
    "   \n",
    "        \n",
    "    def generate_trace_view_list_exec(self,id=0,**kwargs):\n",
    "        Analysis.generate_trace_view_list(self,**kwargs)\n",
    "        showcpu=kwargs.get('showcpu',False)\n",
    "        shownodes=kwargs.get(\"shownodes\",None)\n",
    "        \n",
    "        showdf=self.df.where(F.col(\"Host\").isin(shownodes)) if shownodes else self.df\n",
    "        \n",
    "        events=showdf.toPandas()\n",
    "        coretrack={}\n",
    "        trace_events=[]\n",
    "        starttime=self.starttime\n",
    "        taskend=[]\n",
    "        trace={\"traceEvents\":[]}\n",
    "        exec_hosts={}\n",
    "        hostsdf=showdf.select(\"Host\").distinct().orderBy(\"Host\")\n",
    "        hostid=100000\n",
    "        ended_event=[]\n",
    "        \n",
    "        for i,l in hostsdf.toPandas().iterrows():\n",
    "            exec_hosts[l['Host']]=hostid\n",
    "            hostid=hostid+100000\n",
    "\n",
    "        for idx,l in events.iterrows():\n",
    "            if l['Event']=='SparkListenerTaskStart':\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "\n",
    "                tsk=l['Task ID']\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                self.pids.append(pid)\n",
    "                stime=l['Launch Time']\n",
    "                #the task's starttime and finishtime is the same, ignore it.\n",
    "                if tsk in ended_event:\n",
    "                    continue\n",
    "                if not pid in coretrack:\n",
    "                    tids={}\n",
    "                    trace_events.append({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"{:s}.{:s}\".format(l['Host'],l['Executor ID'])}\n",
    "                      })\n",
    "\n",
    "                else:\n",
    "                    tids=coretrack[pid]\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==-1:\n",
    "                        tids[t]=[tsk,stime]\n",
    "                        break\n",
    "                else:\n",
    "                    t=len(tids)\n",
    "                    tids[t]=[tsk,stime]\n",
    "                #print(\"task {:d} tid is {:s}.{:d}\".format(tsk,pid,t))\n",
    "                coretrack[pid]=tids\n",
    "\n",
    "            if l['Event']=='SparkListenerTaskEnd':\n",
    "                sevt={}\n",
    "                eevt={}\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                tsk=l['Task ID']\n",
    "                fintime=l['Finish Time']\n",
    "\n",
    "                tids=coretrack[pid]\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==tsk:\n",
    "                        tids[t]=[-1,-1]\n",
    "                        break\n",
    "                else:\n",
    "                    ended_event.append(tsk)\n",
    "                    continue\n",
    "                for ps in reversed([key for key in tids.keys()]) :\n",
    "                    if tids[ps][1]-fintime<0 and tids[ps][1]-fintime>=-2:\n",
    "                        fintime=tids[ps][1]\n",
    "                        tids[t]=tids[ps]\n",
    "                        tids[ps]=[-1,-1]\n",
    "                        break\n",
    "                if starttime==0:\n",
    "                    starttime=l['Launch Time']\n",
    "                    print(f'applog start time: {starttime}')\n",
    "\n",
    "                sstime=l['Launch Time']-starttime\n",
    "\n",
    "                trace_events.append({\n",
    "                       'tid':pid+int(t),\n",
    "                       'ts':sstime,\n",
    "                       'dur':fintime-l['Launch Time'],\n",
    "                       'pid':pid,\n",
    "                       \"ph\":'X',\n",
    "                       'name':\"stg{:d}\".format(l['Stage ID']),\n",
    "                       'args':{\"job id\": l['job id'],\n",
    "                               \"stage id\": l['Stage ID'],\n",
    "                               \"tskid\":tsk,\n",
    "                               \"input\":builtins.round(l[\"Bytes Read\"]/1024/1024,2),\n",
    "                               \"spill\":builtins.round(l[\"Memory Bytes Spilled\"]/1024/1024,2),\n",
    "                               \"Shuffle Read Metrics\": \"\",\n",
    "                               \"|---Local Read\": builtins.round(l[\"Local Bytes Read\"]/1024/1024,2),\n",
    "                               \"|---Remote Read\":builtins.round(l[\"Remote Bytes Read\"]/1024/1024,2),\n",
    "                               \"Shuffle Write Metrics\": \"\",\n",
    "                               \"|---Write\":builtins.round(l['Shuffle Bytes Written']/1024/1024,2)\n",
    "                               }\n",
    "                      })\n",
    "\n",
    "                des_time=l['Executor Deserialize Time']\n",
    "                read_time=l['Fetch Wait Time']\n",
    "                exec_time=l['Executor Run Time']\n",
    "                write_time=math.floor(l['Shuffle Write Time']/1000000)\n",
    "                ser_time=l['Result Serialization Time']\n",
    "                getrst_time=l['Getting Result Time']\n",
    "                durtime=fintime-sstime-starttime;\n",
    "\n",
    "                times=[0,des_time,read_time,exec_time,write_time,ser_time,getrst_time]\n",
    "                time_names=['sched delay','deserialize time','read time','executor time','write time','serialize time','result time']\n",
    "                evttime=reduce((lambda x, y: x + y),times)\n",
    "                if evttime>durtime:\n",
    "                    times=[math.floor(l*1.0*durtime/evttime) for l in times]\n",
    "                else:\n",
    "                    times[0]=durtime-evttime\n",
    "\n",
    "                esstime=sstime\n",
    "                for idx in range(0,len(times)):\n",
    "                    if times[idx]>0:\n",
    "                        trace_events.append({\n",
    "                             'tid':pid+int(t),\n",
    "                             'ts':esstime,\n",
    "                             'dur':times[idx],                \n",
    "                             'pid':pid,\n",
    "                             'ph':'X',\n",
    "                             'name':time_names[idx]})\n",
    "                        if idx==3:\n",
    "                            trace_events.append({\n",
    "                                 'tid':pid+int(t),\n",
    "                                 'ts':esstime,\n",
    "                                 'dur':l['JVM GC Time'],\n",
    "                                 'pid':pid,\n",
    "                                 'ph':'X',\n",
    "                                 'name':'GC Time'})\n",
    "                            if showcpu:\n",
    "                                trace_events.append({\n",
    "                                     'tid':pid+int(t),\n",
    "                                     'ts':esstime,\n",
    "                                     'pid':pid,\n",
    "                                     'ph':'C',\n",
    "                                     'name':'cpu% {:d}'.format(pid+int(t)),\n",
    "                                     'args':{'value':l['Executor CPU Time']/1000000.0/times[idx]}})\n",
    "                                trace_events.append({\n",
    "                                     'tid':pid+int(t),\n",
    "                                     'ts':esstime+times[idx],\n",
    "                                     'pid':pid,\n",
    "                                     'ph':'C',\n",
    "                                     'name':'cpu% {:d}'.format(pid+int(t)),\n",
    "                                     'args':{'value':0}})\n",
    "                        esstime=esstime+times[idx]\n",
    "        self.starttime=starttime\n",
    "        return [json.dumps(l) for l in trace_events]\n",
    "\n",
    "    def generate_trace_view_list(self,id=0,**kwargs):\n",
    "        Analysis.generate_trace_view_list(self,**kwargs)\n",
    "        showcpu=kwargs.get('showcpu',False)\n",
    "        shownodes=kwargs.get(\"shownodes\",None)\n",
    "        \n",
    "        showdf=self.df.where(F.col(\"Host\").isin(shownodes)) if shownodes else self.df\n",
    "        \n",
    "        showdf=showdf.orderBy([\"eventtime\", \"Finish Time\"], ascending=[1, 0])\n",
    "        \n",
    "        events=showdf.drop(\"Accumulables\").toPandas()\n",
    "        coretrack={}\n",
    "        trace_events=[]\n",
    "        starttime=self.starttime\n",
    "        taskend=[]\n",
    "        trace={\"traceEvents\":[]}\n",
    "        exec_hosts={}\n",
    "        hostsdf=showdf.select(\"Host\").distinct().orderBy(\"Host\")\n",
    "        hostid=100000\n",
    "        ended_event=[]\n",
    "        \n",
    "        for i,l in hostsdf.toPandas().iterrows():\n",
    "            exec_hosts[l['Host']]=hostid\n",
    "            hostid=hostid+100000\n",
    "\n",
    "        tskmap={}\n",
    "        for idx,l in events.iterrows():\n",
    "            if l['Event']=='SparkListenerTaskStart':\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "\n",
    "                tsk=l['Task ID']\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                self.pids.append(pid)\n",
    "                stime=l['Launch Time']\n",
    "                #the task's starttime and finishtime is the same, ignore it.\n",
    "                if tsk in ended_event:\n",
    "                    continue\n",
    "                if not pid in coretrack:\n",
    "                    tids={}\n",
    "                    trace_events.append({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"{:s}.{:s}\".format(l['Host'],l['Executor ID'])}\n",
    "                      })\n",
    "\n",
    "                else:\n",
    "                    tids=coretrack[pid]\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==-1:\n",
    "                        tids[t]=[tsk,stime]\n",
    "                        break\n",
    "                else:\n",
    "                    t=len(tids)\n",
    "                    tids[t]=[tsk,stime]\n",
    "                #print(f\"task {tsk} tid is {pid}.{t}\")\n",
    "                coretrack[pid]=tids\n",
    "\n",
    "            if l['Event']=='SparkListenerTaskEnd':\n",
    "                sevt={}\n",
    "                eevt={}\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                tsk=l['Task ID']\n",
    "                fintime=l['Finish Time']\n",
    "                \n",
    "                tids=coretrack[pid]\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==tsk:\n",
    "                        tids[t]=[-1,-1]\n",
    "                        break\n",
    "                else:\n",
    "                    ended_event.append(tsk)\n",
    "                    continue\n",
    "                for ps in reversed([key for key in tids.keys()]) :\n",
    "                    if tids[ps][1]-fintime<0 and tids[ps][1]-fintime>=-2:\n",
    "                        fintime=tids[ps][1]\n",
    "                        tids[t]=tids[ps]\n",
    "                        tids[ps]=[-1,-1]\n",
    "                        break\n",
    "                if starttime==0:\n",
    "                    starttime=l['Launch Time']\n",
    "                    print(f'applog start time: {starttime}')\n",
    "\n",
    "                sstime=l['Launch Time']-starttime\n",
    "\n",
    "                trace_events.append({\n",
    "                       'tid':pid+int(t),\n",
    "                       'ts':sstime,\n",
    "                       'dur':fintime-l['Launch Time'],\n",
    "                       'pid':pid,\n",
    "                       \"ph\":'X',\n",
    "                       'name':\"stg{:d}\".format(l['Stage ID']),\n",
    "                       'args':{\"job id\": l['Job ID'],\n",
    "                               \"stage id\": l['Stage ID'],\n",
    "                               \"tskid\":tsk,\n",
    "                               \"input\":builtins.round(l[\"Bytes Read\"]/1024/1024,2),\n",
    "                               \"spill\":builtins.round(l[\"Memory Bytes Spilled\"]/1024/1024,2),\n",
    "                               \"Shuffle Read Metrics\": \"\",\n",
    "                               \"|---Local Read\": builtins.round(l[\"Local Bytes Read\"]/1024/1024,2),\n",
    "                               \"|---Remote Read\":builtins.round(l[\"Remote Bytes Read\"]/1024/1024,2),\n",
    "                               \"Shuffle Write Metrics\": \"\",\n",
    "                               \"|---Write\":builtins.round(l['Shuffle Bytes Written']/1024/1024,2)\n",
    "                               }\n",
    "                      })\n",
    "                tskmap[tsk]={'pid':pid,'tid':pid+int(t)}\n",
    "\n",
    "        self.starttime=starttime\n",
    "        self.tskmap=tskmap\n",
    "        output=[json.dumps(l) for l in trace_events]\n",
    "        \n",
    "        df=self.df\n",
    "        \n",
    "        if showcpu and len(self.metricscollect)>0:\n",
    "            metricscollect=self.metricscollect\n",
    "            metrics_explode=df.where(\"Event='SparkListenerTaskEnd'\").withColumn(\"metrics\",F.explode(\"Accumulables\"))\n",
    "            m1092=metrics_explode.select(F.col(\"Executor ID\"),F.col(\"`Stage ID`\"),\"`Task ID`\",F.col(\"`Finish Time`\"),F.col(\"`Launch Time`\"),(F.col(\"`Finish Time`\")-F.col(\"`Launch Time`\")).alias(\"elapsedtime\"),\"metrics.*\").where(F.col(\"ID\").isin([l[0] for l in metricscollect]))\n",
    "            metric_name_df = spark.createDataFrame(metricscollect)\n",
    "            metric_name_df=metric_name_df.withColumnRenamed(\"_1\",\"ID\")\n",
    "            metric_name_df=metric_name_df.withColumnRenamed(\"_2\",\"unit\")\n",
    "            metric_name_df=metric_name_df.withColumnRenamed(\"_3\",\"mname\")\n",
    "\n",
    "            met_df=m1092.join(metric_name_df,on=\"ID\")\n",
    "            met_df=met_df.withColumn(\"Update\",F.when(F.col(\"unit\")=='nsTiming',F.col(\"Update\")/1000000).otherwise(F.col(\"Update\")+0))\n",
    "            met_df=met_df.where(\"Update>1\")\n",
    "\n",
    "            metdfx=met_df.groupBy(\"Task ID\",\"elapsedtime\").agg(F.sum(\"Update\").alias(\"totalCnt\"))\n",
    "            taskratio=metdfx.withColumn(\"ratio\",F.when(F.col(\"totalCnt\")<F.col(\"elapsedtime\"),1).otherwise(F.col(\"elapsedtime\")/F.col(\"totalCnt\"))).select(\"Task ID\",\"ratio\")\n",
    "            met_df=met_df.join(taskratio,on=\"Task ID\")\n",
    "            met_df=met_df.withColumn(\"Update\",F.col(\"Update\")*F.col(\"ratio\"))\n",
    "\n",
    "            w = (Window.partitionBy('Task ID').orderBy(F.desc(\"Update\")).rangeBetween(Window.unboundedPreceding, 0))\n",
    "            met_df=met_df.withColumn('cum_sum', F.sum('Update').over(w))\n",
    "\n",
    "            met_df=met_df.withColumn(\"starttime\",F.col(\"Launch Time\")+F.col(\"cum_sum\")-F.col(\"Update\"))\n",
    "\n",
    "            tskmapdf = spark.createDataFrame(pandas.DataFrame(self.tskmap).T.reset_index())\n",
    "            met_df=met_df.join(tskmapdf,on=[met_df[\"Task ID\"]==tskmapdf[\"index\"]])\n",
    "\n",
    "            rstdf=met_df.select(\n",
    "                F.col(\"tid\"),\n",
    "                F.round(F.col(\"starttime\")-self.starttime,0).alias(\"ts\"),\n",
    "                F.round(F.col(\"Update\"),0).alias(\"dur\"),\n",
    "                F.col(\"pid\"),\n",
    "                F.lit(\"X\").alias(\"ph\"),\n",
    "                F.col(\"mname\").alias(\"name\")\n",
    "            ).where(F.col(\"ts\").isNotNull()).orderBy('ts')\n",
    "\n",
    "            output.extend(rstdf.toJSON().collect())\n",
    "\n",
    "            qtime=df.where(\"Event='SparkListenerTaskEnd'\").groupBy(\"real_queryid\").agg(F.min(\"Finish Time\").alias(\"time\"))\n",
    "            output.extend(qtime.select(\n",
    "                F.lit(\"i\").alias(\"ph\"),\n",
    "                (F.col(\"time\")-starttime).alias('ts'),\n",
    "                F.lit(0).alias(\"pid\"),\n",
    "                F.lit(0).alias(\"tid\"),\n",
    "                F.lit(\"p\").alias(\"s\")\n",
    "            ).toJSON().collect())\n",
    "        \n",
    "        self.starttime=starttime\n",
    "        \n",
    "        if kwargs.get(\"show_criticalshow_time_metric_path\",True):\n",
    "            output.extend(self.generate_critical_patch_traceview(hostid-1))\n",
    "        \n",
    "        return output        \n",
    "\n",
    "    def generate_critical_patch_traceview(self,pid):\n",
    "        '''Build Chrome trace-viewer (catapult) JSON events for the critical path.\n",
    "\n",
    "        pid: trace-viewer process id under which all events are emitted.\n",
    "        Returns a list of JSON strings, one trace event per entry:\n",
    "          * one 'X' (complete) event per critical-path task (tid 38),\n",
    "          * one 'X' event per query spanning min launch .. max finish,\n",
    "          * per-task time-metric slices laid out inside each task's duration,\n",
    "          * a 'process_name' metadata record labelling the process 'critical path'.\n",
    "        Timestamps are relative to self.starttime.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        traces=[]\n",
    "        df=self.df.where(\"Event='SparkListenerTaskEnd' and real_queryid is not null\")\n",
    "        # join task-end events against the precomputed critical-path task list\n",
    "        criticaltasks=self.criticaltasks\n",
    "        cripds=pandas.DataFrame(criticaltasks)\n",
    "        cripds.columns=['task_id',\"launch\",\"finish\"]\n",
    "        cridf=spark.createDataFrame(cripds)\n",
    "        df_ctsk=df.join(cridf,on=[F.col(\"task_id\")==F.col(\"Task ID\")],how=\"inner\")\n",
    "        # task slices are inset by 1 ms at each end (ts+1, dur-1) relative to launch/finish\n",
    "        traces.extend(df_ctsk.select(F.lit(38).alias(\"tid\"),\n",
    "                      (F.col(\"launch\")-F.lit(self.starttime)+1).alias(\"ts\"),\n",
    "                      (F.col(\"finish\")-F.col(\"launch\")-1).alias(\"dur\"),\n",
    "                      F.lit(pid).alias(\"pid\"),\n",
    "                      F.lit(\"X\").alias(\"ph\"),\n",
    "                      F.concat(F.lit(\"stg\"),F.col(\"Stage ID\")).alias(\"name\"),\n",
    "                      F.struct(\n",
    "                          F.col(\"Task ID\").alias('taskid'),\n",
    "                          F.col(\"Executor ID\").astype(IntegerType()).alias('exec_id'),\n",
    "                          F.col(\"Host\").alias(\"host\"),\n",
    "                          ).alias(\"args\")\n",
    "                        ).toJSON().collect())\n",
    "        # one slice per query: min launch .. max finish across all its tasks\n",
    "        traces.extend(df.groupBy(\"real_queryid\").agg(F.max(\"Finish Time\").alias(\"finish\"),F.min(\"Launch Time\").alias(\"launch\")).select(\n",
    "                        F.lit(38).alias(\"tid\"),\n",
    "                      (F.col(\"launch\")-F.lit(self.starttime)).alias(\"ts\"),\n",
    "                      (F.col(\"finish\")-F.col(\"launch\")).alias(\"dur\"),\n",
    "                      F.lit(pid).alias(\"pid\"),\n",
    "                      F.lit(\"X\").alias(\"ph\"),\n",
    "                      F.concat(F.lit(\"qry\"),F.col(\"real_queryid\")).alias(\"name\")).toJSON().collect())\n",
    "\n",
    "\n",
    "        metricscollect=self.metricscollect\n",
    "\n",
    "        # explode per-task Accumulables and keep only metric IDs listed in metricscollect\n",
    "        metrics_explode=df_ctsk.where(\"Event='SparkListenerTaskEnd'\").withColumn(\"metrics\",F.explode(\"Accumulables\"))\n",
    "        m1092=metrics_explode.select(F.col(\"Executor ID\"),F.col(\"`Stage ID`\"),\"`Task ID`\",F.col(\"`Finish Time`\"),F.col(\"`Launch Time`\"),(F.col(\"`Finish Time`\")-F.col(\"`Launch Time`\")).alias(\"elapsedtime\"),\"metrics.*\").where(F.col(\"ID\").isin([l[0] for l in metricscollect]))\n",
    "        metric_name_df = spark.createDataFrame(metricscollect)\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_1\",\"ID\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_2\",\"unit\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_3\",\"mname\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_4\",\"node\")\n",
    "\n",
    "        # NOTE(review): these two metric names are excluded -- presumably timers that\n",
    "        # overlap other counters and would double-count; confirm against metricscollect\n",
    "        metric_name_df=metric_name_df.where(\"mname <> 'time to collect batch' and mname <> 'time of scan'\")\n",
    "\n",
    "        met_df=m1092.join(metric_name_df,on=\"ID\")\n",
    "        # normalize nsTiming metrics to milliseconds; other units pass through unchanged\n",
    "        met_df=met_df.withColumn(\"Update\",F.when(F.col(\"unit\")=='nsTiming',F.col(\"Update\")/1000000).otherwise(F.col(\"Update\")+0))\n",
    "        \n",
    "        #pandas UDF doesn't work. hang\n",
    "        #tmbk=met_df.groupBy('Task ID').apply(time_breakdown)\n",
    "        \n",
    "        # scale each task's metric durations so their sum fits within the task's\n",
    "        # elapsed time (ratio capped at 1), then stack the slices back-to-back\n",
    "        # in descending-duration order via a running window sum\n",
    "        w=Window.partitionBy('Task ID')\n",
    "        met_df1=met_df.withColumn(\"sum_update\",F.sum(\"Update\").over(w))\n",
    "        met_df2=met_df1.withColumn(\"ratio\",(F.col(\"Finish Time\")-F.col(\"Launch Time\")-2)/F.col(\"sum_update\"))\n",
    "        met_df3=met_df2.withColumn(\"ratio\",F.when(F.col(\"ratio\")>1,1).otherwise(F.col(\"ratio\")))\n",
    "        met_df4=met_df3.withColumn(\"update_ratio\",F.floor(F.col(\"ratio\")*F.col(\"Update\")))\n",
    "        met_df5=met_df4.where(F.col(\"update_ratio\")>2)\n",
    "        w = (Window.partitionBy('Task ID').orderBy(F.desc(\"update_ratio\")).rowsBetween(Window.unboundedPreceding, Window.currentRow))\n",
    "        met_df6=met_df5.withColumn('ltime_dur', F.sum('update_ratio').over(w))\n",
    "        met_df8=met_df6.withColumn(\"ltime\",F.col(\"ltime_dur\")+F.col(\"Launch Time\")-F.col(\"update_ratio\"))\n",
    "\n",
    "        tmbk=met_df8.withColumn(\"taskid\",F.col(\"Task ID\")).withColumn(\"start\",F.col(\"ltime\")+F.lit(1)).withColumn(\"dur\",F.col(\"update_ratio\")-F.lit(1)).withColumn(\"name\",F.col(\"mname\"))\n",
    "        \n",
    "        \n",
    "        traces.extend(tmbk.select(\n",
    "                        F.lit(38).alias(\"tid\"),\n",
    "                      (F.col(\"start\")-F.lit(self.starttime)).alias(\"ts\"),\n",
    "                      (F.col(\"dur\")).alias(\"dur\"),\n",
    "                      F.lit(pid).alias(\"pid\"),\n",
    "                      F.lit(\"X\").alias(\"ph\"),\n",
    "                      F.col(\"name\").alias(\"name\")).toJSON().collect())\n",
    "        # metadata record: names this trace-viewer process 'critical path'\n",
    "        traces.append(json.dumps({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"critical path\"}\n",
    "                      }))\n",
    "        return traces    \n",
    "    \n",
    "    def show_Stage_histogram(apps,stageid,bincount):\n",
    "        '''Plot task-level histograms and distributions for one stage.\n",
    "\n",
    "        stageid:  the Spark stage to inspect.\n",
    "        bincount: number of histogram bins. Previously this parameter was\n",
    "                  accepted but ignored (bins were hard-coded to 15); it is\n",
    "                  now honoured, with 15 still the common caller default.\n",
    "        Shows elapsed-time and input-size histograms, an input-vs-elapsed\n",
    "        scatter per host, and per-host violin plots.\n",
    "        '''\n",
    "        if apps.df is None:\n",
    "            apps.load_data()\n",
    "        \n",
    "        # per-task input bytes: sum of the two input-size accumulables\n",
    "        inputsize = apps.df.where(\"`Stage ID`={:d}\".format(stageid)).select(\"Stage ID\",\"Executor ID\", \"Task ID\", F.explode(\"Accumulables\")) \\\n",
    "                      .select(\"Stage ID\",\"Executor ID\", \"Task ID\",\"col.*\") \\\n",
    "                      .where(\"Name='input size in bytes' or Name='size of files read'\") \\\n",
    "                      .groupBy(\"Task ID\") \\\n",
    "                      .agg((F.sum(\"Update\")).alias(\"input read\"))\n",
    "\n",
    "\n",
    "        stage37=apps.df.where(\"`Stage ID`={:d} and event='SparkListenerTaskEnd'\".format(stageid) )\\\n",
    "                        .join(inputsize,on=[\"Task ID\"],how=\"left\")\\\n",
    "                        .fillna(0) \\\n",
    "                        .select(F.col('Host'), \n",
    "                                F.round((F.col('Finish Time')/1000-F.col('Launch Time')/1000),2).alias('elapsedtime'),\n",
    "                                F.round((F.col('`input read`')+F.col('`Bytes Read`')+F.col('`Local Bytes Read`')+F.col('`Remote Bytes Read`'))/1024/1024,2).alias('input'))\n",
    "        stage37=stage37.cache()\n",
    "        # FIX: use the bincount parameter instead of the previously hard-coded 15\n",
    "        hist_elapsedtime=stage37.select('elapsedtime').rdd.flatMap(lambda x: x).histogram(bincount)\n",
    "        hist_input=stage37.select('input').rdd.flatMap(lambda x: x).histogram(bincount)\n",
    "        fig, axs = plt.subplots(figsize=(30, 5),nrows=1, ncols=2)\n",
    "        ax=axs[0]\n",
    "        binSides, binCounts = hist_elapsedtime\n",
    "        binSides=[builtins.round(l,2) for l in binSides]\n",
    "\n",
    "        N = len(binCounts)\n",
    "        ind = numpy.arange(N)\n",
    "        width = 0.5\n",
    "\n",
    "        rects1 = ax.bar(ind+0.5, binCounts, width, color='b')\n",
    "\n",
    "        ax.set_ylabel('Frequencies')\n",
    "        ax.set_title('stage{:d} elapsed time breakdown'.format(stageid))\n",
    "        ax.set_xticks(numpy.arange(N+1))\n",
    "        ax.set_xticklabels(binSides)\n",
    "\n",
    "        ax=axs[1]\n",
    "        binSides, binCounts = hist_input\n",
    "        binSides=[builtins.round(l,2) for l in binSides]\n",
    "\n",
    "        N = len(binCounts)\n",
    "        ind = numpy.arange(N)\n",
    "        width = 0.5\n",
    "        rects1 = ax.bar(ind+0.5, binCounts, width, color='b')\n",
    "\n",
    "        ax.set_ylabel('Frequencies')\n",
    "        ax.set_title('stage{:d} input data breakdown'.format(stageid))\n",
    "        ax.set_xticks(numpy.arange(N+1))\n",
    "        ax.set_xticklabels(binSides)\n",
    "\n",
    "        out=stage37\n",
    "        outpds=out.toPandas()\n",
    "\n",
    "        fig, axs = plt.subplots(nrows=1, ncols=3, sharey=False,figsize=(30,8),gridspec_kw = {'width_ratios':[1, 1, 1]})\n",
    "        plt.subplots_adjust(wspace=0.01)\n",
    "\n",
    "        groups= outpds.groupby('Host')\n",
    "        for name, group in groups:\n",
    "            axs[0].plot(group.input, group.elapsedtime, marker='o', linestyle='', ms=5, label=name)\n",
    "        axs[0].set_xlabel('input size (MB)')\n",
    "        axs[0].set_ylabel('elapsed time (s)')\n",
    "\n",
    "        axs[0].legend()\n",
    "\n",
    "        axs[0].get_shared_y_axes().join(axs[0], axs[1])\n",
    "\n",
    "        sns.violinplot(y='elapsedtime', x='Host', data=outpds,palette=['g'],ax=axs[1])\n",
    "\n",
    "        sns.violinplot(y='input', x='Host', data=outpds,palette=['g'],ax=axs[2])\n",
    "\n",
    "        #ax.xaxis.set_major_formatter(mtick.FormatStrFormatter(''))\n",
    "        #ax.yaxis.set_major_formatter(mtick.FormatStrFormatter(''))\n",
    "\n",
    "        # NOTE(review): dead KMeans clustering experiment, kept behind 'if False'\n",
    "        if False:\n",
    "            out=stage37\n",
    "            vecAssembler = VectorAssembler(inputCols=[\"input\",'elapsedtime'], outputCol=\"features\").setHandleInvalid(\"skip\")\n",
    "            new_df = vecAssembler.transform(out)\n",
    "            kmeans = KMeans(k=2, seed=1)  # 2 clusters here\n",
    "            model = kmeans.fit(new_df.select('features'))\n",
    "            transformed = model.transform(new_df)\n",
    "\n",
    "\n",
    "            outpds=transformed.select('Host','elapsedtime','input','prediction').toPandas()\n",
    "\n",
    "            fig, axs = plt.subplots(nrows=1, ncols=2, sharey=False,figsize=(30,8),gridspec_kw = {'width_ratios':[1, 1]})\n",
    "            plt.subplots_adjust(wspace=0.01)\n",
    "\n",
    "            groups= outpds.groupby('prediction')\n",
    "            for name, group in groups:\n",
    "                axs[0].plot(group.input, group.elapsedtime, marker='o', linestyle='', ms=5, label=name)\n",
    "            axs[0].legend()\n",
    "\n",
    "            bars=transformed.where('prediction=1').groupBy(\"Host\").count().toPandas()\n",
    "\n",
    "            axs[1].bar(bars['Host'], bars['count'], 0.4, color='coral')\n",
    "            axs[1].set_title('cluster=1')\n",
    "\n",
    "        plt.show()\n",
    "        \n",
    "    def show_Stages_hist(apps,**kwargs):\n",
    "        '''Plot cumulative stage-time contribution, then per-stage histograms.\n",
    "\n",
    "        kwargs:\n",
    "          bincount  -- histogram bins passed to show_Stage_histogram (default 15)\n",
    "          threshold -- cumulative task-time fraction cut-off (default 0.9)\n",
    "          queryid   -- int or list; restrict to these real_queryid values\n",
    "        '''\n",
    "        if apps.df is None:\n",
    "            apps.load_data()\n",
    "        \n",
    "        bincount=kwargs.get(\"bincount\",15)\n",
    "        threshold=kwargs.get(\"threshold\",0.9)\n",
    "        \n",
    "        query=kwargs.get(\"queryid\",None)\n",
    "        if query and type(query)==int:\n",
    "            query = [query,]\n",
    "        df=apps.df.where(F.col(\"real_queryid\").isin(query)) if query else apps.df\n",
    "        \n",
    "        # total and per-stage accumulated task time, sorted hottest-first\n",
    "        totaltime=df.where(\"event='SparkListenerTaskEnd'\" ).agg(F.sum(F.col('Finish Time')-F.col('Launch Time')).alias('total_time')).collect()[0]['total_time']\n",
    "        stage_time=df.where(\"event='SparkListenerTaskEnd'\" ).groupBy('`Stage ID`').agg(F.sum(F.col('Finish Time')-F.col('Launch Time')).alias('total_time')).orderBy('total_time', ascending=False).toPandas()\n",
    "        stage_time['acc_total'] = stage_time['total_time'].cumsum()/totaltime\n",
    "        stage_time=stage_time.reset_index()\n",
    "        fig, ax = plt.subplots(figsize=(30, 5))\n",
    "\n",
    "        rects1 = ax.plot(stage_time['index'],stage_time['acc_total'],'b.-')\n",
    "        ax.set_xticks(stage_time['index'])\n",
    "        ax.set_xticklabels(stage_time['Stage ID'])\n",
    "        ax.set_xlabel('stage')\n",
    "        ax.grid(which='major', axis='x')\n",
    "        plt.show()\n",
    "        # keep stages up to and including the first one crossing the threshold\n",
    "        shownstage=[]\n",
    "        for x in stage_time.index:\n",
    "            if stage_time['acc_total'][x]<=threshold:\n",
    "                shownstage.append(stage_time['Stage ID'][x])\n",
    "            else:\n",
    "                shownstage.append(stage_time['Stage ID'][x])\n",
    "                break\n",
    "        for row in shownstage:\n",
    "            apps.show_Stage_histogram(row,bincount) \n",
    "            \n",
    "    def get_hottest_stages(apps,**kwargs):\n",
    "        '''Rank stages by normalized task time; optionally plot the top stages.\n",
    "\n",
    "        kwargs:\n",
    "          threshold -- cumulative time fraction to keep in the bar plot (default 0.9)\n",
    "          queryid   -- int or list; restrict to these real_queryid values\n",
    "          plot      -- render bar chart and styled table (default True)\n",
    "        Returns the full stage_time pandas DataFrame (all stages, sorted).\n",
    "        '''\n",
    "        if apps.df is None:\n",
    "            apps.load_data()\n",
    "        \n",
    "        bincount=kwargs.get(\"bincount\",15)\n",
    "        threshold=kwargs.get(\"threshold\",0.9)\n",
    "        plot=kwargs.get(\"plot\",True)\n",
    "        \n",
    "        query=kwargs.get(\"queryid\",None)\n",
    "        if query and type(query)==int:\n",
    "            query = [query,]\n",
    "        df=apps.df.where(F.col(\"real_queryid\").isin(query)) if query else apps.df.where(\"queryid is not NULL\")\n",
    "\n",
    "        # per-stage time normalized by achievable parallelism (capped by task count)\n",
    "        stage_time=df.where(\"event='SparkListenerTaskEnd'\" ).groupBy('`Stage ID`','Job ID','real_queryid').agg(\n",
    "            F.sum(F.col('Finish Time')-F.col('Launch Time')).alias('total_time'),\n",
    "            F.stddev(F.col('Finish Time')/1000-F.col('Launch Time')/1000).alias('stdev_time'),\n",
    "            F.count(\"*\").alias(\"cnt\"),\n",
    "            F.first('queryid').astype(IntegerType()).alias('queryid')\n",
    "            )\\\n",
    "            .select('`Stage ID`','Job ID','real_queryid','queryid',\n",
    "                    (F.col(\"total_time\")/1000/(F.when(F.col(\"cnt\")>F.lit(apps.executor_instances*apps.executor_cores/apps.taskcpus),F.lit(apps.executor_instances*apps.executor_cores/apps.taskcpus)).otherwise(F.col(\"cnt\")))).alias(\"total_time\"),\n",
    "                    F.col(\"stdev_time\")\n",
    "                   ).orderBy('total_time', ascending=False).toPandas()\n",
    "\n",
    "        totaltime=stage_time['total_time'].sum()\n",
    "        stage_time['acc_total'] = stage_time['total_time'].cumsum()/totaltime\n",
    "        stage_time['total'] = stage_time['total_time']/totaltime\n",
    "        stage_time=stage_time.reset_index()\n",
    "\n",
    "        # FIX: copy the .loc slice before adding the 'stg' column; assigning to a\n",
    "        # view raises SettingWithCopyWarning and the column may not be written\n",
    "        shownstage=stage_time.loc[stage_time['acc_total'] <=threshold].copy()\n",
    "        shownstage['stg']=shownstage['real_queryid'].astype(str)+'_'+shownstage['Job ID'].astype(str)+'_'+shownstage['Stage ID'].astype(str)\n",
    "        if plot:\n",
    "            shownstage.plot.bar(x=\"stg\",y=\"total\",figsize=(30,8))\n",
    "\n",
    "\n",
    "\n",
    "        # color each table row by its queryid on a normalized 'brg' colormap\n",
    "        norm = matplotlib.colors.Normalize(vmin=0, vmax=max(stage_time.queryid))\n",
    "        cmap = matplotlib.cm.get_cmap('brg')\n",
    "        def setbkcolor(x):\n",
    "            rgba=cmap(norm(x['queryid']))\n",
    "            return ['background-color:rgba({:d},{:d},{:d},1); color:white'.format(int(rgba[0]*255),int(rgba[1]*255),int(rgba[2]*255))]*9\n",
    "\n",
    "        if plot:\n",
    "            display(stage_time.style.apply(setbkcolor,axis=1).format({\"total_time\":lambda x: '{:,.2f}'.format(x),\"acc_total\":lambda x: '{:,.2%}'.format(x),\"total\":lambda x: '{:,.2%}'.format(x)}))\n",
    "        \n",
    "        return stage_time\n",
    "\n",
    "    def scatter_elapsetime_input(apps,stageid):\n",
    "        '''Scatter-plot per-task elapsed time (s) vs bytes read (MB) for one stage.'''\n",
    "        if apps.df is None:\n",
    "            apps.load_data()\n",
    "        # build the two derived columns up front, then project and convert to pandas\n",
    "        elapsed_col=F.round((F.col('Finish Time')/1000-F.col('Launch Time')/1000),2).alias('elapsedtime')\n",
    "        input_col=F.round((F.col('`Bytes Read`')+F.col('`Local Bytes Read`')+F.col('`Remote Bytes Read`'))/1024/1024,2).alias('input')\n",
    "        task_pdf=apps.df.where(\"`Stage ID`={:d} and event='SparkListenerTaskEnd'\".format(stageid)).select(elapsed_col,input_col).toPandas()\n",
    "        task_pdf.plot.scatter('input','elapsedtime',figsize=(30, 5))\n",
    "\n",
    "    def get_critical_path_stages(self):     \n",
    "        df=self.df.where(\"Event='SparkListenerTaskEnd'\")\n",
    "        criticaltasks=self.criticaltasks\n",
    "        cripds=pandas.DataFrame(criticaltasks)\n",
    "        cripds.columns=['task_id',\"launch\",\"finish\"]\n",
    "        cridf=spark.createDataFrame(cripds)\n",
    "        df_ctsk=df.join(cridf,on=[F.col(\"task_id\")==F.col(\"Task ID\")],how=\"inner\")\n",
    "        df_ctsk=df_ctsk.withColumn(\"elapsed\",(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000)\n",
    "        return df_ctsk.where(\"elapsed>10\").orderBy(F.desc(\"elapsed\")).select(\"real_queryid\",F.round(\"elapsed\",2).alias(\"elapsed\"),\"Host\",\"executor ID\",\"Stage ID\",\"Task ID\",F.round(F.col(\"Bytes Read\")/1000000,0).alias(\"file read\"),F.round((F.col(\"Local Bytes Read\")+F.col(\"Remote Bytes Read\"))/1000000,0).alias(\"shuffle read\")).toPandas()\n",
    "        \n",
    "    def show_time_metric(self,**kwargs):\n",
    "        '''Aggregate per-executor time metrics into a stacked breakdown chart.\n",
    "\n",
    "        kwargs:\n",
    "          shownodes    -- restrict to these Host values\n",
    "          queryid      -- int or list; restrict to these real_queryid values\n",
    "          plot         -- render charts (default True)\n",
    "          taskids      -- restrict to these Task IDs (forces per-task mode:\n",
    "                          exec_cores=execs=1, per-executor chart suppressed)\n",
    "          showexecutor -- also plot the per-executor stacked bars\n",
    "        Returns a one-row pandas DataFrame of summed metric times, columns\n",
    "        labelled with their percentage of total query time.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        shownodes=kwargs.get(\"shownodes\",None)\n",
    "        query=kwargs.get(\"queryid\",None)\n",
    "        plot=kwargs.get(\"plot\",True)\n",
    "        taskids=kwargs.get(\"taskids\",None)\n",
    "        \n",
    "        if query and type(query)==int:\n",
    "            query = [query,]\n",
    "        \n",
    "        showexecutor=kwargs.get(\"showexecutor\",True) if not taskids else False\n",
    "        queryid = query[0] if query else 0\n",
    "        \n",
    "        df=self.df.where(F.col(\"Host\").isin(shownodes)) if shownodes else self.df\n",
    "        df=df.where(F.col(\"real_queryid\").isin(query)) if query else df.where(\"queryid is not NULL\")\n",
    "\n",
    "        df=df.where(F.col(\"Task ID\").isin(taskids)) if taskids else df\n",
    "\n",
    "        # in per-task mode, normalize as a single core on a single executor\n",
    "        exec_cores=1 if taskids else self.executor_cores\n",
    "        execs=1 if taskids else self.executor_instances\n",
    "\n",
    "        metricscollect=self.metricscollect\n",
    "\n",
    "        # explode per-task Accumulables and keep only metric IDs listed in metricscollect\n",
    "        metrics_explode=df.where(\"Event='SparkListenerTaskEnd'\").withColumn(\"metrics\",F.explode(\"Accumulables\"))\n",
    "        m1092=metrics_explode.select(F.col(\"Executor ID\"),F.col(\"`Stage ID`\"),\"`Task ID`\",F.col(\"`Finish Time`\"),F.col(\"`Launch Time`\"),(F.col(\"`Finish Time`\")-F.col(\"`Launch Time`\")).alias(\"elapsedtime\"),\"metrics.*\").where(F.col(\"ID\").isin([l[0] for l in metricscollect]))\n",
    "        metric_name_df = spark.createDataFrame(metricscollect)\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_1\",\"ID\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_2\",\"unit\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_3\",\"mname\")\n",
    "        metric_name_df=metric_name_df.withColumnRenamed(\"_4\",\"node\")\n",
    "\n",
    "        runtime=metrics_explode.agg(F.round(F.max(\"Finish Time\")/1000-F.min(\"Launch Time\")/1000,2).alias(\"runtime\")).collect()[0][\"runtime\"]\n",
    "\n",
    "        met_df=m1092.join(metric_name_df,on=\"ID\")\n",
    "        # normalize nsTiming metrics to milliseconds; other units pass through unchanged\n",
    "        met_df=met_df.withColumn(\"Update\",F.when(F.col(\"unit\")=='nsTiming',F.col(\"Update\")/1000000).otherwise(F.col(\"Update\")+0))\n",
    "        outpdf=met_df.groupBy(\"`Executor ID`\",\"mname\").sum(\"Update\").orderBy(\"Executor ID\").toPandas()\n",
    "\n",
    "        met_time_cnt=df.where(\"Event='SparkListenerTaskEnd'\")\n",
    "        exectime=met_time_cnt.groupBy(\"Executor ID\").agg((F.max(\"Finish Time\")-F.min(\"Launch Time\")).alias(\"totaltime\"),F.sum(F.col(\"`Finish Time`\")-F.col(\"`Launch Time`\")).alias(\"tasktime\"))\n",
    "\n",
    "        totaltime_query=met_time_cnt.groupBy(\"real_queryid\").agg((F.max(\"Finish Time\")-F.min(\"Launch Time\")).alias(\"totaltime\")).agg(F.sum(\"totaltime\").alias(\"totaltime\")).collect()\n",
    "        totaltime_query=totaltime_query[0][\"totaltime\"]\n",
    "        \n",
    "        pdf=exectime.toPandas()\n",
    "        exeids=set(outpdf['Executor ID'])\n",
    "        outpdfs=[outpdf[outpdf[\"Executor ID\"]==l] for l in exeids]\n",
    "        tasktime=pdf.set_index(\"Executor ID\").to_dict()['tasktime']\n",
    "\n",
    "        # fold one executor's metric frame into the accumulated wide frame,\n",
    "        # adding synthetic 'idle' and 'not_counted' rows (both clamped at 0)\n",
    "        def comb(l,r):\n",
    "            execid=list(r['Executor ID'])[0]\n",
    "            lp=r[['mname','sum(Update)']]\n",
    "            lp.columns=[\"mname\",\"val_\"+execid]\n",
    "            idle=totaltime_query*exec_cores-tasktime[execid]\n",
    "            nocount=tasktime[execid]-sum(lp[\"val_\"+execid])\n",
    "            if idle<0:\n",
    "                idle=0\n",
    "            if nocount<0:\n",
    "                nocount=0\n",
    "            # NOTE(review): DataFrame.append was removed in pandas 2.0 -- migrate to pandas.concat\n",
    "            lp=lp.append([{\"mname\":\"idle\",\"val_\"+execid:idle}])\n",
    "            lp=lp.append([{\"mname\":\"not_counted\",\"val_\"+execid:nocount}])\n",
    "            if l is not None:\n",
    "                return pandas.merge(lp, l,on=[\"mname\"],how='outer')\n",
    "            else:\n",
    "                return lp\n",
    "\n",
    "        rstpdf=None\n",
    "        for l in outpdfs[0:]:\n",
    "            rstpdf=comb(rstpdf,l)\n",
    "            \n",
    "        # convert ms sums to seconds per core\n",
    "        for l in [l for l in rstpdf.columns if l!=\"mname\"]:\n",
    "            rstpdf[l]=rstpdf[l]/1000/exec_cores\n",
    "    \n",
    "        rstpdf=rstpdf.sort_values(by=\"val_\"+list(exeids)[0],axis=0,ascending=False)\n",
    "        if showexecutor and plot:\n",
    "            rstpdf.set_index(\"mname\").T.plot.bar(stacked=True,figsize=(30,8))\n",
    "        # collapse executors into one averaged column; recompute idle cluster-wide\n",
    "        pdf_sum=pandas.DataFrame(rstpdf.set_index(\"mname\").T.sum())\n",
    "        totaltime=totaltime_query/1000\n",
    "        pdf_sum[0]=pdf_sum[0]/(execs)\n",
    "        pdf_sum[0][\"idle\"]=(totaltime_query-sum(tasktime.values())/execs/exec_cores)/1000\n",
    "        pdf_sum=pdf_sum.sort_values(by=0,axis=0,ascending=False)\n",
    "        pdf_sum=pdf_sum.T\n",
    "        # label each column with its share of total query time\n",
    "        pdf_sum.columns=[\"{:>2.0f}%_{:s}\".format(pdf_sum[l][0]/totaltime*100,l) for l in pdf_sum.columns]\n",
    "        matplotlib.rcParams['font.sans-serif'] = \"monospace\"\n",
    "        matplotlib.rcParams['font.family'] = \"monospace\"\n",
    "        import matplotlib.font_manager as font_manager\n",
    "        if plot:\n",
    "            ax=pdf_sum.plot.bar(stacked=True,figsize=(30,8))\n",
    "            font = font_manager.FontProperties(family='monospace',\n",
    "                                               style='normal', size=14)\n",
    "            ax.legend(prop=font,loc=4)\n",
    "            plt.title(\"{:s} q{:d} executors={:d} cores_per_executor={:d} parallelism={:d} sumtime={:.0f} runtime={:.0f}\".format(self.file.split(\"/\")[2],queryid,self.executor_instances,self.executor_cores,self.parallelism,totaltime,runtime),fontdict={'fontsize':24})\n",
    "        return pdf_sum\n",
    "\n",
    "    def show_critical_path_time_breakdown(self,**kwargs):\n",
    "        '''Run show_time_metric restricted to the critical-path tasks.\n",
    "\n",
    "        FIX: kwargs were previously accepted but silently dropped; they are now\n",
    "        forwarded (e.g. plot=False, shownodes=...). The taskids restriction to\n",
    "        the critical path always wins, matching the original behavior.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        kwargs[\"taskids\"]=[l[0].item() for l in self.criticaltasks]\n",
    "        return self.show_time_metric(**kwargs)\n",
    "    \n",
    "    def get_spark_config(self):\n",
    "        '''Return the app's Spark Properties as a transposed one-column pandas DataFrame.\n",
    "\n",
    "        Side effects: sets self.appid from the event log, and permanently widens\n",
    "        the global pandas display options (max_rows/max_columns/max_colwidth)\n",
    "        for the rest of the session.\n",
    "        '''\n",
    "        df=spark.read.json(self.file)\n",
    "        self.appid=df.where(\"`App ID` is not null\").collect()[0][\"App ID\"]\n",
    "        pandas.set_option('display.max_rows', None)\n",
    "        pandas.set_option('display.max_columns', None)\n",
    "        pandas.set_option('display.max_colwidth', 100000)\n",
    "        return df.select(\"Properties.*\").where(\"`spark.app.id` is not null\").limit(1).toPandas().T\n",
    "    \n",
    "    def get_app_name(self):\n",
    "        '''Render the Spark application name as large red HTML text.'''\n",
    "        cfg=self.get_spark_config()\n",
    "        app_name=cfg.loc[cfg.index=='spark.app.name'][0][0]\n",
    "        display(HTML(\"<font size=5 color=red>\" + app_name + \"</font>\"))\n",
    "        \n",
    "        \n",
    "    def get_query_time(self,**kwargs):\n",
    "        '''Summarize per-query runtime, I/O, spill and task-time metrics.\n",
    "\n",
    "        kwargs:\n",
    "          queryid   -- int or list; restrict to these real_queryid values\n",
    "          showtable -- if False, return the raw pandas DataFrame without styling\n",
    "          plot      -- display the styled HTML table (default True)\n",
    "        Returns a pandas DataFrame indexed by real_queryid; sizes are GB,\n",
    "        times are seconds, output rows in billions.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        queryid=kwargs.get(\"queryid\",None)\n",
    "        showtable=kwargs.get(\"showtable\",True)\n",
    "        plot=kwargs.get(\"plot\",True)\n",
    "        \n",
    "        if queryid and type(queryid)==int:\n",
    "            queryid = [queryid,]\n",
    "           \n",
    "        df=self.df.where(F.col(\"real_queryid\").isin(queryid)) if queryid else self.df.where(\"queryid is not NULL\")\n",
    "        \n",
    "            \n",
    "        # per-query list of stage ids, and task time normalized by parallelism\n",
    "        stages=df.select(\"real_queryid\",\"Stage ID\").distinct().orderBy(\"Stage ID\").groupBy(\"real_queryid\").agg(F.collect_list(\"Stage ID\").alias(\"stages\")).orderBy(\"real_queryid\")\n",
    "        runtimeacc=df.where(\"Event='SparkListenerTaskEnd'\") \\\n",
    "                      .groupBy(\"real_queryid\") \\\n",
    "                      .agg(F.round(F.sum(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000/self.executor_instances/self.executor_cores*self.taskcpus,2).alias(\"acc_task_time\"))\n",
    "        # input bytes per query from the two input-size accumulables, in GB\n",
    "        inputsize = df.select(\"real_queryid\",\"Stage ID\",\"Executor ID\", \"Task ID\", F.explode(\"Accumulables\")) \\\n",
    "                      .select(\"real_queryid\",\"Stage ID\",\"Executor ID\", \"Task ID\",\"col.*\") \\\n",
    "                      .where(\"Name='input size in bytes' or Name='size of files read'\") \\\n",
    "                      .groupBy(\"real_queryid\") \\\n",
    "                      .agg(F.round(F.sum(\"Update\")/1024/1024/1024,2).alias(\"input read\")).orderBy(\"real_queryid\")\n",
    "        # fall back to the driver-side accumulator log (dfacc) when available\n",
    "        if self.dfacc is not None:\n",
    "            inputsizev1 = self.dfacc.where(\"Name='size of files read'\").groupBy(\"real_queryid\").agg(F.round(F.sum(\"Update\")/1024/1024/1024,2).alias(\"input read v1\")).orderBy(\"real_queryid\")\n",
    "            inputsize=inputsize.join(inputsizev1,on=\"real_queryid\",how=\"outer\")\n",
    "            inputsize=inputsize.withColumn(\"input read\",F.coalesce(F.col(\"input read\"),F.col(\"input read v1\"))).drop(\"input read v1\")\n",
    "        \n",
    "        outputrows = df.select(\"real_queryid\",\"Stage ID\",\"Stage ID\",F.explode(\"Accumulables\"))\\\n",
    "                        .select(\"real_queryid\",\"Stage ID\",\"Stage ID\",\"col.*\")\\\n",
    "                        .where(\"Name='number of output rows'\")\\\n",
    "                        .groupBy(\"real_queryid\")\\\n",
    "                        .agg(F.round(F.sum(\"Update\")/1000000000,2).alias(\"output rows\"))\n",
    "        \n",
    "        stages=runtimeacc.join(stages,on=\"real_queryid\",how=\"left\")\n",
    "        stages=inputsize.join(stages,on=\"real_queryid\",how=\"left\")\n",
    "        stages=stages.join(outputrows,on='real_queryid',how=\"left\")\n",
    "        \n",
    "        # main per-query aggregate; per-task times averaged over parallelism\n",
    "        out=df.groupBy(\"real_queryid\").agg(\n",
    "            F.round(F.max(\"query_endtime\")/1000-F.min(\"query_starttime\")/1000,2).alias(\"runtime\"),\n",
    "            F.round(F.sum(\"Disk Bytes Spilled\")/1024/1024/1024,2).alias(\"disk spilled\"),\n",
    "            F.round(F.sum(\"Memory Bytes Spilled\")/1024/1024/1024,2).alias(\"memspilled\"),\n",
    "            F.round(F.sum(\"Local Bytes Read\")/1024/1024/1024,2).alias(\"local_read\"),\n",
    "            F.round(F.sum(\"Remote Bytes Read\")/1024/1024/1024,2).alias(\"remote_read\"),\n",
    "            F.round(F.sum(\"Shuffle Bytes Written\")/1024/1024/1024,2).alias(\"shuffle_write\"),\n",
    "            F.round(F.sum(\"Executor Deserialize Time\")/1000/self.parallelism,2).alias(\"deser_time\"),\n",
    "            F.round(F.sum(\"Executor Run Time\")/1000/self.parallelism,2).alias(\"run_time\"),\n",
    "            F.round(F.sum(\"Result Serialization Time\")/1000/self.parallelism,2).alias(\"ser_time\"),\n",
    "            F.round(F.sum(\"Fetch Wait Time\")/1000/self.parallelism,2).alias(\"f_wait_time\"),\n",
    "            F.round(F.sum(\"JVM GC Time\")/1000/self.parallelism,2).alias(\"gc_time\"),\n",
    "            F.round(F.max(\"Peak Execution Memory\")/1000000000*self.executor_instances*self.executor_cores,2).alias(\"peak_mem\"),\n",
    "            F.max(\"queryid\").alias(\"queryid\")\n",
    "            ).join(stages,\"real_queryid\",how=\"left\").orderBy(\"real_queryid\").toPandas().set_index(\"real_queryid\")\n",
    "        out[\"executors\"]=self.executor_instances\n",
    "        out[\"core/exec\"]=self.executor_cores\n",
    "        out[\"task.cpus\"]=self.taskcpus\n",
    "        out['parallelism']=self.parallelism\n",
    "        \n",
    "        if not showtable:\n",
    "            return out\n",
    "\n",
    "        # style: render the three time columns as in-cell progress bars\n",
    "        # proportional to their share of the query runtime\n",
    "        def highlight_greater(x):\n",
    "            m1 = x['acc_task_time'] / x['runtime'] * 100\n",
    "            m2 = x['run_time'] / x['runtime'] * 100\n",
    "            m3 = x['f_wait_time'] / x['runtime'] * 100\n",
    "            \n",
    "\n",
    "            df1 = pandas.DataFrame('', index=x.index, columns=x.columns)\n",
    "\n",
    "            df1['acc_task_time'] = m1.apply(lambda x: 'background-image: linear-gradient(to right,#5fba7d {:f}%,white {:f}%)'.format(x,x))\n",
    "            df1['run_time'] = m2.apply(lambda x: 'background-image: linear-gradient(to right,#5fba7d {:f}%,white {:f}%)'.format(x,x))\n",
    "            df1['f_wait_time'] = m3.apply(lambda x: 'background-image: linear-gradient(to right,#d65f5f {:f}%,white {:f}%)'.format(x,x))\n",
    "            return df1\n",
    "\n",
    "\n",
    "        cm = sns.light_palette(\"green\", as_cmap=True)\n",
    "        if plot:\n",
    "            display(out.style.apply(highlight_greater, axis=None).background_gradient(cmap=cm,subset=['input read', 'shuffle_write']))\n",
    "        \n",
    "        return out\n",
    "    \n",
    "    def get_query_time_metric(self):\n",
    "        '''Render the time-metric breakdown chart for each distinct query.\n",
    "\n",
    "        FIX: show_time_metric reads the 'queryid' kwarg (not 'query'), so the\n",
    "        original 'query=' keyword was silently ignored and no per-query filter\n",
    "        was applied.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        querids=self.df.select(\"queryid\").distinct().collect()\n",
    "        for idx,q in enumerate([l[\"queryid\"] for l in querids]):\n",
    "            # NOTE(review): show_time_metric filters on real_queryid; confirm the\n",
    "            # values in the 'queryid' column match what that filter expects\n",
    "            self.show_time_metric(queryid=[q,],showexecutor=False)\n",
    "            \n",
    "    def getOperatorCount(self):\n",
    "        '''Count plan-node occurrences per query across all query plans.\n",
    "\n",
    "        Returns a pandas DataFrame: rows are nodeName values (sorted), columns\n",
    "        are real_queryid values, cells are occurrence counts.\n",
    "        FIX: removed an unused 'df=spark.read.json(self.file)' statement that\n",
    "        re-read the whole event log for nothing.\n",
    "        '''\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        queryids=self.df.select(F.col(\"queryid\").astype(LongType()),F.col(\"real_queryid\")).distinct().orderBy(\"real_queryid\")\n",
    "        queryplans=self.queryplans.collect()\n",
    "        list_queryid=[l.real_queryid for l in queryids.collect()]\n",
    "\n",
    "        # depth-first walk of a plan tree, tallying node names into qps\n",
    "        def get_child(execid,node):\n",
    "            #wholestagetransformer not counted\n",
    "            if node['nodeName'] is not None and not node['nodeName'].startswith(\"WholeStageCodegenTransformer\"):\n",
    "                if node[\"nodeName\"] not in qps:\n",
    "                    qps[node[\"nodeName\"]]={l:0 for l in list_queryid}\n",
    "                qps[node[\"nodeName\"]][execid]=qps[node[\"nodeName\"]][execid]+1\n",
    "            if node[\"children\"] is not None:\n",
    "                for c in node[\"children\"]:\n",
    "                    get_child(execid,c)\n",
    "\n",
    "        qps={}\n",
    "        for c in queryplans:\n",
    "            get_child(c['real_queryid'],c)\n",
    "\n",
    "        return pandas.DataFrame(qps).T.sort_index(axis=0)        \n",
    "    \n",
    "    def get_query_plan(self,**kwargs):\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        queryid=kwargs.get(\"queryid\",None)\n",
    "        stageid=kwargs.get(\"stageid\",None)\n",
    "        \n",
    "        outputstage=kwargs.get(\"outputstage\",None)\n",
    "        \n",
    "        show_plan_only=kwargs.get(\"show_plan_only\",False)\n",
    "        show_simple_string=kwargs.get(\"show_simple_string\",False)\n",
    "\n",
    "        plot=kwargs.get(\"plot\",True)\n",
    "        \n",
    "        colors=[\"#{:02x}{:02x}{:02x}\".format(int(l[0]*255),int(l[1]*255),int(l[2]*255)) for l in matplotlib.cm.get_cmap('tab20').colors]\n",
    "        \n",
    "        if queryid is not None:\n",
    "            if type(queryid)==int or type(queryid)==str:\n",
    "                queryid = [queryid,]\n",
    "            shown_stageid = [l[\"Stage ID\"] for l in self.df.where(F.col(\"real_queryid\").isin(queryid)).select(\"Stage ID\").distinct().collect()]\n",
    "        if stageid is not None:\n",
    "            if type(stageid)==int:\n",
    "                shown_stageid = [stageid,]\n",
    "            elif type(stageid)==list:\n",
    "                shown_stageid = stageid\n",
    "            queryid = [l[\"real_queryid\"] for l in self.df.where(F.col(\"`Stage ID`\").isin(shown_stageid)).select(\"real_queryid\").limit(1).collect()]\n",
    "\n",
    "\n",
    "        queryplans=[]\n",
    "        queryplans = self.queryplans.where(F.col(\"real_queryid\").isin(queryid)).orderBy(\"real_queryid\").collect() if queryid else self.queryplans.orderBy(\"real_queryid\").collect()\n",
    "        dfmetric=self.df.where(\"Event='SparkListenerTaskEnd'\").select(\"queryid\",\"real_queryid\",\"Stage ID\",\"Job ID\",F.explode(\"Accumulables\").alias(\"metric\")).select(\"*\",\"metric.*\").select(\"Stage ID\",\"ID\",\"Update\").groupBy(\"ID\",\"Stage ID\").agg(F.round(F.sum(\"Update\"),1).alias(\"value\"),F.round(F.stddev(\"Update\"),1).alias(\"stdev\")).collect()\n",
    "        accid2stageid={l.ID:(l[\"Stage ID\"],l[\"value\"],l[\"stdev\"]) for l in dfmetric}\n",
    "\n",
    "        stagetime=self.df.where((F.col(\"real_queryid\").isin(queryid))).where(F.col(\"Event\")=='SparkListenerTaskEnd').groupBy(\"Stage ID\").agg(\n",
    "            F.round(F.sum(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000/self.executor_instances/self.executor_cores*self.taskcpus,1).alias(\"elapsed time\"),\n",
    "            F.round(F.stddev(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000,1).alias(\"time stdev\"),\n",
    "            F.count(F.col(\"Task ID\")).alias(\"partitions\")\n",
    "            ).orderBy(F.desc(\"elapsed time\")).collect()\n",
    "\n",
    "        apptotaltime=reduce(lambda x,y: x+y['elapsed time'], stagetime,0)\n",
    "        if apptotaltime==0:\n",
    "            display(HTML(\"<font size=4 color=red>Error, totaltime is 0 </font>\"))\n",
    "            apptotaltime=1\n",
    "            return \"\"\n",
    "\n",
    "        stagemap={l[\"Stage ID\"]:l[\"elapsed time\"] for l in stagetime}\n",
    "        stage_time_stdev_map={l[\"Stage ID\"]:l[\"time stdev\"] for l in stagetime}\n",
    "        stagepartmap={l[\"Stage ID\"]:l[\"partitions\"] for l in stagetime}\n",
    "\n",
    "        keystage=[]\n",
    "        keystagetime=[]\n",
    "        subtotal=0\n",
    "        for s in stagetime:\n",
    "            subtotal=subtotal+s['elapsed time']\n",
    "            keystage.append(s['Stage ID'])\n",
    "            keystagetime.append(s['elapsed time'])\n",
    "            if subtotal/apptotaltime>0.9:\n",
    "                break\n",
    "        keystagetime=[\"{:02x}{:02x}\".format(int(255*l/keystagetime[0]),255-int(255*l/keystagetime[0])) for l in keystagetime if keystagetime[0]>0]\n",
    "        keystagemap=dict(zip(keystage,keystagetime))\n",
    "        outstr=[]\n",
    "        def print_plan(real_queryid,level,node,parent_stageid):\n",
    "            stageid = accid2stageid[int(node[\"metrics\"][0][\"accumulatorId\"])][0]  if node[\"metrics\"] is not None and len(node[\"metrics\"])>0 and node[\"metrics\"][0][\"accumulatorId\"] in accid2stageid else parent_stageid\n",
    "\n",
    "            if stageid in shown_stageid:\n",
    "                fontcolor=f\"color:#{keystagemap[stageid]}00;font-weight:bold\" if stageid in keystagemap else \"color:#000000\"\n",
    "                stagetime=0 if stageid not in stagemap else stagemap[stageid]\n",
    "                stageParts=0 if stageid not in stagepartmap else stagepartmap[stageid]\n",
    "\n",
    "                input_rowcntstr=\"\"\n",
    "                output_rowcntstr=\"\"\n",
    "                timename={}\n",
    "                input_columnarbatch=\"\"\n",
    "                output_columnarbatch=\"\"\n",
    "                output_row_batch=\"\"\n",
    "                other_metric_name={}\n",
    "\n",
    "                outputrows=0\n",
    "                outputbatches=0\n",
    "                if node[\"metrics\"] is not None:\n",
    "                    for m in node[\"metrics\"]:\n",
    "\n",
    "                        if m[\"accumulatorId\"] not in accid2stageid:\n",
    "                            continue\n",
    "                        \n",
    "                        if m[\"name\"].endswith(\"block wall nanos\") or m['name'].endswith(\"cpu nanos\"):\n",
    "                            continue\n",
    "                            \n",
    "                        \n",
    "                        value=accid2stageid[m[\"accumulatorId\"]][1]\n",
    "                        stdev_value=accid2stageid[m[\"accumulatorId\"]][2]\n",
    "                        stdev_value=0 if stdev_value is None else stdev_value\n",
    "                        if m[\"metricType\"] in ['nsTiming','timing']:\n",
    "                            totaltime=value/1000 if  m[\"metricType\"] == 'timing' else value/1000000000\n",
    "                            stdev_value=stdev_value/1000 if  m[\"metricType\"] == 'timing' else stdev_value/1000000000\n",
    "                            \n",
    "                            timeratio= 0  if stagetime==0 else totaltime/self.executor_instances/self.executor_cores*self.taskcpus/stagetime*100\n",
    "                            timeratio_query = totaltime/self.executor_instances/self.executor_cores*self.taskcpus/apptotaltime*100\n",
    "                            if timeratio > 10 or timeratio_query>10:\n",
    "                                timename[m[\"name\"]]=\"<font style='background-color:#ffff42'>{:.2f}s ({:.1f}%, {:.1f}%, {:.2f})</font>\".format(totaltime,timeratio, totaltime/self.executor_instances/self.executor_cores*self.taskcpus/apptotaltime*100,stdev_value)\n",
    "                            else:\n",
    "                                timename[m[\"name\"]]=\"{:.2f}s ({:.1f}%, {:.1f}%, {:.2f})\".format(totaltime,timeratio, totaltime/self.executor_instances/self.executor_cores*self.taskcpus/apptotaltime*100,stdev_value)\n",
    "                        elif m[\"name\"] in [\"number of output rows\",\"number of final output rows\"]:\n",
    "                            output_rowcntstr=\"{:,.1f}\".format(value/1000/1000)+\" M\"\n",
    "                            outputrows=value\n",
    "                        elif m[\"name\"] in [\"number of output columnar batches\",\"number of output batches\",\"output_batches\", \"number of output vectors\",\"number of final output vectors\", \"records read\"]: \n",
    "                            # records reads is the output of shuffle\n",
    "                            output_columnarbatch=\"{:,d}\".format(int(value))\n",
    "                            outputbatches=value\n",
    "                        elif m[\"name\"]==\"number of input rows\":\n",
    "                            input_rowcntstr=\"{:,.1f}\".format(value/1000/1000)+\" M\"\n",
    "                        elif m[\"name\"] in [\"number of input batches\",\"input_batches\",\"number of input vectors\"]:\n",
    "                            input_columnarbatch=\"{:,d}\".format(int(value))\n",
    "                        else:\n",
    "                            if value>1000000000:\n",
    "                                other_metric_name[m[\"name\"]]=\"{:,.1f} G ({:,.1f})\".format(value/1000000000,stdev_value/1000000000)\n",
    "                            elif value>1000000:\n",
    "                                other_metric_name[m[\"name\"]]=\"{:,.1f} M ({:,.1f})\".format(value/1000000,stdev_value/1000000)\n",
    "                            elif value>1000:\n",
    "                                other_metric_name[m[\"name\"]]=\"{:,.1f} K ({:,.1f})\".format(value/1000,stdev_value/1000)\n",
    "                            else:\n",
    "                                other_metric_name[m[\"name\"]]=\"{:,d} ({:,.1f})\".format(int(value),stdev_value)\n",
    "\n",
    "\n",
    "                if outputrows>0 and outputbatches>0:\n",
    "                    output_row_batch=\"{:,d}\".format(int(outputrows/outputbatches))\n",
    "\n",
    "\n",
    "                fontcolor=f\"color:#{keystagemap[stageid]}00;font-weight:bold\" if stageid in keystage else \"color:#000000\"\n",
    "                stagetime=0 if stageid not in stagemap else stagemap[stageid]\n",
    "                stage_time_stdev=0 if stageid not in stage_time_stdev_map else stage_time_stdev_map[stageid]\n",
    "                \n",
    "                nodenamestr=node[\"nodeName\"]\n",
    "                if nodenamestr is None:\n",
    "                    nodenamestr=\"\"\n",
    "                if nodenamestr in ['ColumnarToRow','RowToArrowColumnar','ArrowColumnarToRow','ArrowRowToColumnarExec','GlutenColumnarToRowExec','GlutenRowToArrowColumnar']:\n",
    "                    nodename='<span style=\"color: green; background-color: #ffff42\">'+nodenamestr+'</span>'\n",
    "                else:\n",
    "                    nodename=nodenamestr\n",
    "                if outputstage is not None:\n",
    "                    outputstage.append({\"queryid\":real_queryid,\"stageid\":stageid,\"stagetime\":stagetime,\"stageParts\":stageParts,\"nodename\":nodenamestr,\"output_rowcnt\":outputrows,\"nodename_level\":\" \".join([\"|_\" for l in range(0,level)]) + \" \" + nodenamestr})\n",
    "                if not show_plan_only:\n",
    "                    nodestr= \" \".join([\"|_\" for l in range(0,level)]) + \" \" + nodename\n",
    "                    if show_simple_string :\n",
    "                        simstr=node['simpleString']\n",
    "                        nodestr = nodestr + \"<br>\\n\" +  simstr                                                                 \n",
    "                    \n",
    "                    timenametable='<table  style=\"width:100%\">\\n'\n",
    "                    \n",
    "                    timenameSort=list(timename)\n",
    "                    \n",
    "                    for nameidx in sorted(timename):\n",
    "                        timenametable+=f\"<tr><td>{nameidx}</td><td>{timename[nameidx]}</td></tr>\"\n",
    "                    timenametable+=\"</table>\\n\"\n",
    "                    \n",
    "                    \n",
    "                    othertable='<table style=\"width:100%\">\\n'\n",
    "                    for nameidx in sorted(other_metric_name):\n",
    "                        othertable+=f\"<tr><td>{nameidx}</td><td>{other_metric_name[nameidx]}</td></tr>\"\n",
    "                    othertable+=\"</table>\\n\"\n",
    "                    \n",
    "                    outstr.append(f\"<tr><td style='{fontcolor}'>{stageid}</td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {stagetime}({stage_time_stdev}) </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {stageParts} </td>\"+\n",
    "                                  f\"<td style='text-align:left; background-color:{colors[stageid % 20]}'>\" + nodestr + f\"</td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {input_rowcntstr} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {input_columnarbatch} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {output_rowcntstr} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {output_columnarbatch} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {output_row_batch} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}' colspan=2> {timenametable} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}' colspan=2> {othertable} </td>\"+\n",
    "                                  \"</tr>\")\n",
    "                else:\n",
    "                    outstr.append(f\"<tr><td style='{fontcolor}'>{stageid}</td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {stagetime} </td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {stageParts} </td>\"+\n",
    "                                  f\"<td style='text-align:left; background-color:{colors[stageid % 20]}'>\" + \" \".join([\"|_\" for l in range(0,level)]) + \" \" + nodename + f\"</td>\"+\n",
    "                                  f\"<td style='{fontcolor}'> {output_rowcntstr} </td></tr>\")\n",
    "                    \n",
    "            if node[\"children\"] is not None:\n",
    "                for c in node[\"children\"]:\n",
    "                    print_plan(real_queryid, level+1,c,stageid)\n",
    "\n",
    "        for c in queryplans:\n",
    "            outstr.append(\"<font color=red size=4>\"+str(c['real_queryid'])+\"</font><table>\")\n",
    "            if not show_plan_only:\n",
    "                outstr.append('''<tr>\n",
    "                                    <td>stage id</td>\n",
    "                                    <td>stage time</td>\n",
    "                                    <td>partions</td>\n",
    "                                    <td>operator</td>\n",
    "                                    <td>input rows</td>\n",
    "                                    <td>input batches</td>\n",
    "                                    <td>output rows</td>\n",
    "                                    <td>output batches</td>\n",
    "                                    <td>output rows/batch</td>\n",
    "                                    <td width=150>time metric name</td>\n",
    "                                    <td width=200>time(%stage,%total,stdev)</td>\n",
    "                                    <td width=150>other metric name</td>\n",
    "                                    <td width=130>value(stdev)</td>\n",
    "                                </tr>''')\n",
    "            else:\n",
    "                outstr.append('''<tr>\n",
    "                                    <td>stage id</td>\n",
    "                                    <td>stage time</td>\n",
    "                                    <td>partions</td>\n",
    "                                    <td>operator</td>\n",
    "                                    <td>output rows</td>\n",
    "                                </tr>''')\n",
    "\n",
    "            print_plan(c['real_queryid'],0,c,0)\n",
    "            outstr.append(\"</table>\")\n",
    "        if plot:\n",
    "            display(HTML(\" \".join(outstr)))\n",
    "        return \" \".join(outstr)\n",
    "    \n",
    "    def get_metric_output_rowcnt(self, **kwargs):\n",
    "        # Convenience wrapper: per-operator totals of the 'number of output rows'\n",
    "        # metric. Every keyword argument is forwarded to get_metric_rowcnt.\n",
    "        return self.get_metric_rowcnt(\"number of output rows\", **kwargs)\n",
    "        \n",
    "    def get_metric_input_rowcnt(self, **kwargs):\n",
    "        # Convenience wrapper: per-operator totals of the 'number of input rows'\n",
    "        # metric. Every keyword argument is forwarded to get_metric_rowcnt.\n",
    "        return self.get_metric_rowcnt(\"number of input rows\", **kwargs)\n",
    "        \n",
    "    def get_metric_rowcnt(self,rowname, **kwargs):\n",
    "        \"\"\"Aggregate a plan metric (typically a row count) per stage.\n",
    "\n",
    "        rowname: metric name or list of names, e.g. 'number of output rows'.\n",
    "        kwargs:\n",
    "            queryid: int or list -- restrict to these queries.\n",
    "            stageid: int or list -- restrict to these stages (with queryid).\n",
    "            show_task: if True, keep one record per task instead of summing.\n",
    "\n",
    "        Returns a Spark DataFrame when queryid is given, otherwise a pandas\n",
    "        pivot table (nodename rows x real_queryid columns); None when the\n",
    "        metric does not appear in any collected query plan.\n",
    "        \"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        queryid=kwargs.get(\"queryid\",None)\n",
    "        stageid=kwargs.get(\"stageid\",None)\n",
    "        show_task=kwargs.get(\"show_task\",False)\n",
    "        \n",
    "        # Accept scalar ids as a convenience; downstream code expects lists.\n",
    "        if queryid and type(queryid)==int:\n",
    "            queryid = [queryid,]\n",
    "            \n",
    "        if stageid and type(stageid)==int:\n",
    "            stageid = [stageid,]\n",
    "            \n",
    "        # Collected plan rows for the selected queries (all queries when unset).\n",
    "        queryplans = self.queryplans.where(F.col(\"real_queryid\").isin(queryid)).orderBy(\"real_queryid\").collect() if queryid else self.queryplans.orderBy(\"real_queryid\").collect()\n",
    "        qps=[]\n",
    "\n",
    "        rownames=rowname if type(rowname)==list else [rowname,]\n",
    "        # Depth-first walk of one plan tree: record (nodeName, queryid,\n",
    "        # accumulatorId) for every node exposing one of the wanted metrics.\n",
    "        def get_child(execid,node):\n",
    "            if node['metrics'] is not None:\n",
    "                outputrows=[x for x in node[\"metrics\"] if \"name\" in x and x[\"name\"] in rownames]\n",
    "                if len(outputrows)>0:\n",
    "                    qps.append([node[\"nodeName\"],execid,outputrows[0]['accumulatorId']])\n",
    "            if node[\"children\"] is not None:\n",
    "                for c in node[\"children\"]:\n",
    "                    get_child(execid,c)\n",
    "        for c in queryplans:\n",
    "            get_child(c['real_queryid'],c)\n",
    "\n",
    "        if len(qps)==0:\n",
    "            print(\"Metric \",rowname,\" is not found. \")\n",
    "            return None\n",
    "        # Per-stage elapsed time normalized by the cluster's concurrent task\n",
    "        # slots (instances * cores / cpus-per-task).\n",
    "        stagetime=self.df.where(\"Event='SparkListenerTaskEnd'\").groupBy(\"Stage ID\").agg(F.round(F.sum(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000/self.executor_instances/self.executor_cores*self.taskcpus,2).alias(\"stage time\"))\n",
    "        # One row per (task, accumulable update), flattened.\n",
    "        dfmetric=self.df.where(\"Event='SparkListenerTaskEnd'\").select(\"queryid\",\"real_queryid\",\"Stage ID\",\"Job ID\",F.explode(\"Accumulables\").alias(\"metric\")).select(\"*\",\"metric.*\").drop(\"metric\")\n",
    "        numrowmetric=spark.createDataFrame(qps)\n",
    "        numrowmetric=numrowmetric.withColumnRenamed(\"_1\",\"metric\").withColumnRenamed(\"_2\",\"real_queryid\").withColumnRenamed(\"_3\",\"metricid\")\n",
    "        # Right join keeps only the accumulators that belong to the wanted metric.\n",
    "        dfmetric_rowcnt=dfmetric.join(numrowmetric.drop(\"real_queryid\"),on=[F.col(\"metricid\")==F.col(\"ID\")],how=\"right\")\n",
    "        # NOTE(review): show_task=True together with queryid=None falls through\n",
    "        # to the pivot branch below, which expects the aggregated 'total_row'\n",
    "        # and 'nodename' columns -- that combination looks unsupported; confirm.\n",
    "        if show_task:\n",
    "            stagemetric=dfmetric_rowcnt.join(stagetime,\"Stage ID\")\n",
    "        else:\n",
    "            stagemetric=dfmetric_rowcnt.groupBy(\"queryid\",\"real_queryid\",\"Job ID\",\"Stage ID\",\"metricid\").agg(F.round(F.sum(\"Update\")/1000000,2).alias(\"total_row\"),F.max(\"metric\").alias(\"nodename\")).join(stagetime,\"Stage ID\")\n",
    "\n",
    "        if queryid:\n",
    "            if stageid:\n",
    "                return stagemetric.where(F.col(\"real_queryid\").isin(queryid) & F.col(\"Stage ID\").isin(stageid)).orderBy(\"Stage ID\")\n",
    "            else:\n",
    "                return stagemetric.where(F.col(\"real_queryid\").isin(queryid)).orderBy(\"Stage ID\")\n",
    "        else:\n",
    "            # No query filter: pivot to nodename rows x real_queryid columns.\n",
    "            noderow=stagemetric.groupBy(\"real_queryid\",\"nodename\").agg(F.round(F.sum(\"total_row\"),2).alias(\"total_row\")).orderBy(\"nodename\").collect()\n",
    "            out={}\n",
    "            qids=set([r.real_queryid for r in noderow])\n",
    "            for r in noderow:\n",
    "                if r.nodename not in out:\n",
    "                    out[r.nodename]={c:0 for c in qids}\n",
    "                out[r.nodename][r.real_queryid]=r.total_row\n",
    "            return pandas.DataFrame(out).T.sort_index(axis=0)\n",
    "    \n",
    "    def get_query_info(self,queryid):\n",
    "        \"\"\"Display a full report for one query: time stats, stage stats,\n",
    "        query plan, stage histograms, time metrics and operator row counts.\n",
    "        Purely a display helper; returns nothing.\"\"\"\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> time stat info </b></font>\",))\n",
    "        # Called for its display side effect; the return value was previously\n",
    "        # bound to an unused local.\n",
    "        self.get_query_time(queryid=queryid)\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> stage stat info </b></font>\",))\n",
    "        display(self.get_stage_stat(queryid=queryid))\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> query plan </b></font>\",))\n",
    "        self.get_query_plan(queryid=queryid)\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> stage hist info </b></font>\",))\n",
    "        self.show_Stages_hist(queryid=queryid)\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> time info </b></font>\",))\n",
    "        display(self.show_time_metric(queryid=queryid))\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> operator and rowcount </b></font>\",))\n",
    "        display(self.get_metric_input_rowcnt(queryid=queryid))\n",
    "        display(self.get_metric_output_rowcnt(queryid=queryid))\n",
    "        \n",
    "    def get_app_info(self,**kwargs):\n",
    "        \"\"\"Display an application-level summary: history-server link, query\n",
    "        times, operator counts and operator input/output row counts rendered\n",
    "        with a color gradient, plus the time-metric breakdown.\"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        def _display_gradient(pdf):\n",
    "            # Render a pandas frame with an OrRd background gradient scaled\n",
    "            # between the frame's global min and max (was copy-pasted 3x).\n",
    "            display(pdf.style.apply(background_gradient,\n",
    "                   cmap='OrRd',\n",
    "                   m=pdf.min().min(),\n",
    "                   M=pdf.max().max(),\n",
    "                   low=0,\n",
    "                   high=1))\n",
    "\n",
    "        display(HTML(f\"<font color=red size=7 face='Courier New'><b> {self.appid} </b></font>\",))\n",
    "        # NOTE(review): `localhost` is a bare name assumed to be defined at the\n",
    "        # notebook level (history-server host) -- confirm it exists before use.\n",
    "        display(HTML(f\"<a href=http://{localhost}:18080/history/{self.appid}>http://{localhost}:18080/history/{self.appid}</a>\"))\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> query time </b></font>\",))\n",
    "        # Called for its display side effect; the return value is not needed.\n",
    "        self.get_query_time(**kwargs)\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> operator count </b></font>\",))\n",
    "        _display_gradient(self.getOperatorCount())\n",
    "        \n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> operator input row count </b></font>\",))\n",
    "        pdf=self.get_metric_input_rowcnt(**kwargs)\n",
    "        if pdf is not None:\n",
    "            _display_gradient(pdf)\n",
    "        display(HTML(\"<font color=red size=7 face='Courier New'><b> operator output row count </b></font>\",))\n",
    "        pdf=self.get_metric_output_rowcnt(**kwargs)\n",
    "        if pdf is not None:\n",
    "            _display_gradient(pdf)\n",
    "        self.show_time_metric(**kwargs)\n",
    "        \n",
    "    def get_stage_stat(self,**kwargs):\n",
    "        \"\"\"Per-stage task statistics for the queries selected via kwargs.\n",
    "\n",
    "        kwargs:\n",
    "            queryid: int or list -- queries to include.\n",
    "\n",
    "        Returns a pandas DataFrame with one row per (Job ID, Stage ID),\n",
    "        ordered by stage; time columns are in seconds, byte counters in GB.\n",
    "        \"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        queryid=kwargs.get(\"queryid\",None)\n",
    "\n",
    "        if queryid and type(queryid)==int:\n",
    "            queryid = [queryid,]\n",
    "            \n",
    "        # Finished tasks of the selected queries only.\n",
    "        df=self.df.where(F.col(\"real_queryid\").isin(queryid)).where(F.col(\"Event\")=='SparkListenerTaskEnd')\n",
    "        \n",
    "        # Input bytes per stage; the accumulable name differs between sources,\n",
    "        # so both spellings are matched.\n",
    "        inputsize = df.select(\"real_queryid\",\"Stage ID\",\"Executor ID\", \"Task ID\", F.explode(\"Accumulables\")) \\\n",
    "                      .select(\"real_queryid\",\"Stage ID\",\"Executor ID\", \"Task ID\",\"col.*\") \\\n",
    "                      .where(\"Name='input size in bytes' or Name='size of files read'\") \\\n",
    "                      .groupBy(\"Stage ID\") \\\n",
    "                      .agg(F.round(F.sum(\"Update\")/1024/1024/1024,2).alias(\"input read\"))\n",
    "        \n",
    "        # 'elapsed time' spreads the summed task wall time over the cluster's\n",
    "        # concurrent task slots (instances * cores / cpus-per-task). The /1000\n",
    "        # and /1000000000 factors scale millisecond and nanosecond counters to\n",
    "        # seconds; /1024^3 factors scale bytes to GB.\n",
    "        return df.groupBy(\"Job ID\",\"Stage ID\").agg(\n",
    "            F.round(F.sum(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000/self.executor_instances/self.executor_cores*self.taskcpus,1).alias(\"elapsed time\"),\n",
    "            F.round(F.sum(F.col(\"Disk Bytes Spilled\"))/1024/1024/1024,1).alias(\"disk spilled\"),\n",
    "            F.round(F.sum(F.col(\"Memory Bytes Spilled\"))/1024/1024/1024,1).alias(\"mem spilled\"),\n",
    "            F.round(F.sum(F.col(\"Local Bytes Read\"))/1024/1024/1024,1).alias(\"local read\"),\n",
    "            F.round(F.sum(F.col(\"Remote Bytes Read\"))/1024/1024/1024,1).alias(\"remote read\"),\n",
    "            F.round(F.sum(F.col(\"Shuffle Bytes Written\"))/1024/1024/1024,1).alias(\"shuffle write\"),\n",
    "            F.round(F.sum(F.col(\"Executor Deserialize Time\"))/1000,1).alias(\"deseri time\"),\n",
    "            F.round(F.sum(F.col(\"Fetch Wait Time\"))/1000,1).alias(\"fetch wait time\"),\n",
    "            F.round(F.sum(F.col(\"Shuffle Write Time\"))/1000000000,1).alias(\"shuffle write time\"),\n",
    "            F.round(F.sum(F.col(\"Result Serialization Time\"))/1000,1).alias(\"seri time\"),\n",
    "            F.round(F.sum(F.col(\"Getting Result Time\"))/1000,1).alias(\"get result time\"),\n",
    "            F.round(F.sum(F.col(\"JVM GC Time\"))/1000,1).alias(\"gc time\"),\n",
    "            F.round(F.sum(F.col(\"Executor CPU Time\"))/1000000000,1).alias(\"exe cpu time\")    \n",
    "            ).join(inputsize,on=[\"Stage ID\"],how=\"left\").orderBy(\"Stage ID\").toPandas()\n",
    "    \n",
    "    def get_metrics_by_node(self,node_name):\n",
    "        \"\"\"Collect every accumulator metric attached to plan nodes whose\n",
    "        nodeName is in node_name (str or list) and pivot them into one Spark\n",
    "        row per (real_queryid, nodeID, Stage ID): one column per metric name\n",
    "        plus <name>_mean and <name>_stddev, joined with each stage's partition\n",
    "        count and elapsed time.\n",
    "        \"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "        \n",
    "        if type(node_name)==str:\n",
    "            node_name=[node_name]\n",
    "        metrics=self.queryplans.collect()\n",
    "        # coalesce accumulates [accumulatorId, metricType, name, nodeName,\n",
    "        # nodeID]; metricsid is a one-element list so the nested function can\n",
    "        # mutate it (pre-'nonlocal' idiom) -- it numbers matching nodes across\n",
    "        # all collected plans.\n",
    "        coalesce=[]\n",
    "        metricsid=[0]\n",
    "        def get_metric(root):\n",
    "            if root['nodeName'] in node_name:\n",
    "                metricsid[0]=metricsid[0]+1\n",
    "                for l in root[\"metrics\"]:\n",
    "                    coalesce.append([l['accumulatorId'],l[\"metricType\"],l['name'],root[\"nodeName\"],metricsid[0]])\n",
    "            if root[\"children\"] is not None:\n",
    "                for c in root[\"children\"]:\n",
    "                    get_metric(c)\n",
    "        for c in metrics:\n",
    "            get_metric(c)\n",
    "\n",
    "        # One row per (task, accumulable update), flattened.\n",
    "        df=self.df.select(\"queryid\",\"real_queryid\",'Stage ID','Task ID','Job ID',F.explode(\"Accumulables\"))\n",
    "        df=df.select(\"*\",\"col.*\")\n",
    "        metricdf=spark.createDataFrame(coalesce)\n",
    "        metricdf=metricdf.withColumnRenamed(\"_1\",\"ID\").withColumnRenamed(\"_2\",\"Unit\").withColumnRenamed(\"_3\",\"metricName\").withColumnRenamed(\"_4\",\"nodeName\").withColumnRenamed(\"_5\",\"nodeID\")\n",
    "        # Right join keeps only the accumulators that belong to matching nodes.\n",
    "        df=df.join(metricdf,on=[\"ID\"],how=\"right\")\n",
    "        shufflemetric=set(l[2] for l in coalesce)\n",
    "        # One aggregated frame per distinct metric name; metrics whose name\n",
    "        # starts with 'avg' are averaged, everything else is summed.\n",
    "        metricdfs=[df.where(F.col(\"Name\")==l).groupBy(\"real_queryid\",\"nodeID\",\"Stage ID\").agg(F.stddev(\"Update\").alias(l+\"_stddev\"),F.mean(\"Update\").alias(l+\"_mean\"),F.mean(\"Update\").alias(l) if l.startswith(\"avg\") else F.sum(\"Update\").alias(l)) for l in shufflemetric]\n",
    "        \n",
    "        stagetimedf=self.df.where(\"Event='SparkListenerTaskEnd'\").groupBy(\"Stage ID\").agg(F.count(\"*\").alias(\"partnum\"),F.round(F.sum(F.col(\"Finish Time\")-F.col(\"Launch Time\"))/1000,2).alias(\"ElapsedTime\"))\n",
    "        \n",
    "        # Full outer joins so a node missing one metric still keeps the rest.\n",
    "        nodemetric=reduce(lambda x,y: x.join(y, on=['nodeID',\"Stage ID\",\"real_queryid\"],how=\"full\"),metricdfs)\n",
    "        return nodemetric.join(stagetimedf,on=\"Stage ID\")\n",
    "    \n",
    "    \n",
    "    def get_coalesce_batch_row_cnt(self,**kwargs):\n",
    "        \"\"\"Chart CoalesceBatches efficiency (rows per input/output batch)\n",
    "        and return the per-stage pandas DataFrame behind the chart.\"\"\"\n",
    "        # Pull CoalesceBatches metrics and give the row counter a short name.\n",
    "        node_metrics = self.get_metrics_by_node(\"CoalesceBatches\").withColumnRenamed(\"number of output rows\", \"rows\")\n",
    "\n",
    "        pandas.options.display.float_format = '{:,}'.format\n",
    "\n",
    "        # Drop stages that moved too few rows to give a meaningful ratio.\n",
    "        coalescedf = node_metrics.orderBy(\"real_queryid\", 'Stage ID').where(\"rows>4000\").toPandas()\n",
    "\n",
    "        for ratio_col, batch_col in ((\"row/input_batch\", \"input_batches\"), (\"row/out_batch\", \"output_batches\")):\n",
    "            coalescedf[ratio_col] = coalescedf[\"rows\"] / coalescedf[batch_col]\n",
    "        coalescedf['stage'] = coalescedf[\"real_queryid\"].astype(str) + \"_\" + coalescedf['Stage ID'].astype(str)\n",
    "\n",
    "        # Batch-size ratios on the primary axis, raw row count on a secondary one.\n",
    "        ax = coalescedf.plot(y=[\"row/input_batch\", \"row/out_batch\"], figsize=(30, 8), style=\"-*\")\n",
    "        coalescedf.plot(ax=ax, y=['rows'], secondary_y=['rows'], style=\"k_\")\n",
    "        self.print_real_queryid(ax, coalescedf)\n",
    "\n",
    "        return coalescedf\n",
    "    \n",
    "    def print_real_queryid(self,ax,dataset):\n",
    "        \"\"\"Draw a green vertical separator between consecutive runs of\n",
    "        identical real_queryid values and label each completed run below the\n",
    "        axis. The trailing run is not labeled.\"\"\"\n",
    "        # The x tick labels would collide with the run labels, so hide them.\n",
    "        ax.axes.get_xaxis().set_ticks([])\n",
    "\n",
    "        ymin, ymax = ax.get_ybound()\n",
    "\n",
    "        ids = list(dataset['real_queryid'])\n",
    "        current = ids[0]\n",
    "        run_start = 0\n",
    "        for position, qid in enumerate(ids):\n",
    "            if qid == current:\n",
    "                continue\n",
    "            # Run boundary: vertical line halfway between the two data points.\n",
    "            boundary = position - 1 + 0.5\n",
    "            ax.add_line(mlines.Line2D([boundary, boundary], [ymin, ymax], color=\"green\"))\n",
    "            # Center the finished run's label under its span, just below ymin.\n",
    "            ax.text(run_start + (boundary - run_start) / 2 - 0.25,\n",
    "                    ymin - (ymax - ymin) / 20, f\"{current}\", size=20)\n",
    "            current = qid\n",
    "            run_start = boundary\n",
    "\n",
    "    def get_shuffle_stat(self,**kwargs):\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "            \n",
    "        shufflesize=kwargs.get(\"shuffle_size\",1000000)\n",
    "        queryid=kwargs.get(\"queryid\",None)\n",
    "        if queryid is not None:\n",
    "            if type(queryid) is str or type(queryid) is int:\n",
    "                queryid=[queryid,]\n",
    "\n",
    "        exchangedf=self.get_metrics_by_node([\"ColumnarExchange\",\"ColumnarExchangeAdaptor\"])\n",
    "        exchangedf.cache()\n",
    "        if exchangedf.count() == 0:\n",
    "            return (None, None)\n",
    "\n",
    "        mapdf=exchangedf.where(\"`time to split` is not null\").select(\"nodeID\",F.col(\"Stage ID\").alias(\"map_stageid\"),\"real_queryid\",F.floor(F.col(\"time to split\")/F.col(\"time to split_mean\")).alias(\"map_partnum\"),\"time to compress\",\"time to split\",\"shuffle write time\",\"time to spill\",'shuffle records written','data size','shuffle bytes written','shuffle bytes written_mean','shuffle bytes written_stddev','shuffle bytes spilled','number of input rows','number of input batches')\n",
    "        reducerdf=exchangedf.where(\"`time to split` is null\").select(\"nodeID\",F.col(\"Stage ID\").alias(\"reducer_stageid\"),\"real_queryid\",'local blocks read','local bytes read',F.floor(F.col(\"records read\")/F.col(\"records read_mean\")).alias(\"reducer_partnum\"),(F.col('avg read batch num rows')/10).alias(\"avg read batch num rows\"),'remote bytes read','records read','remote blocks read',(F.col(\"number of output rows\")/F.col(\"records read\")).alias(\"avg rows per split recordbatch\"))\n",
    "        shuffledf=mapdf.join(reducerdf,on=[\"nodeID\",\"real_queryid\"],how=\"full\")\n",
    "        if queryid is not None:\n",
    "            shuffledf=shuffledf.where(F.col(\"real_queryid\").isin(queryid))\n",
    "        shuffle_pdf=shuffledf.where(\"`shuffle bytes written`>1000000\").orderBy(\"real_queryid\",\"map_stageid\",\"nodeID\").toPandas()\n",
    "        if shuffle_pdf.shape[0] == 0:\n",
    "            return (shuffledf, None)\n",
    "\n",
    "        shuffle_pdf[\"shuffle bytes written\"]=shuffle_pdf[\"shuffle bytes written\"]/1000000000\n",
    "        shuffle_pdf[\"data size\"]=shuffle_pdf[\"data size\"]/1000000000\n",
    "        shuffle_pdf[\"shuffle bytes written_mean\"]=shuffle_pdf[\"shuffle bytes written_mean\"]/1000000\n",
    "        shuffle_pdf[\"shuffle bytes written_stddev\"]=shuffle_pdf[\"shuffle bytes written_stddev\"]/1000000\n",
    "        ax=shuffle_pdf.plot(y=[\"avg read batch num rows\",'avg rows per split recordbatch'],figsize=(30,8),style=\"-*\",title=\"average batch size after split\")\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        shuffle_pdf[\"split_ratio\"]=shuffle_pdf[\"records read\"]/shuffle_pdf['number of input batches']\n",
    "        ax=shuffle_pdf.plot(y=[\"split_ratio\",\"records read\"],secondary_y=[\"records read\"],figsize=(30,8),style=\"-*\",title=\"Split Ratio\")\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        shuffle_pdf[\"compress_ratio\"]=shuffle_pdf[\"data size\"]/shuffle_pdf['shuffle bytes written']\n",
    "        ax=shuffle_pdf.plot(y=[\"shuffle bytes written\",\"compress_ratio\"],secondary_y=[\"compress_ratio\"],figsize=(30,8),style=\"-*\",title=\"compress ratio\")\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        shufflewritepdf=shuffle_pdf\n",
    "        ax=shufflewritepdf.plot.bar(y=[\"shuffle write time\",\"time to spill\",\"time to compress\",\"time to split\"],stacked=True,figsize=(30,8),title=\"split time + shuffle write time vs. shuffle bytes written\")\n",
    "        ax=shufflewritepdf.plot(ax=ax,y=[\"shuffle bytes written\"],secondary_y=[\"shuffle bytes written\"],style=\"-*\")\n",
    "        self.print_real_queryid(ax,shufflewritepdf)\n",
    "        shuffle_pdf['avg input batch size']=shuffle_pdf[\"number of input rows\"]/shuffle_pdf[\"number of input batches\"]\n",
    "        ax=shuffle_pdf.plot(y=[\"avg input batch size\"],figsize=(30,8),style=\"b-*\",title=\"average input batch size\")\n",
    "        ax=shuffle_pdf.plot.bar(ax=ax,y=['number of input rows'],secondary_y=True)\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        \n",
    "        metrics=self.queryplans.collect()\n",
    "        coalesce=[]\n",
    "        metricsid=[0]\n",
    "        def get_metric(root):\n",
    "            if root['nodeName'] in [\"ColumnarExchange\",\"ColumnarExchangeAdaptor\"]:\n",
    "                metricsid[0]=metricsid[0]+1\n",
    "                for l in root[\"metrics\"]:\n",
    "                    coalesce.append([l['accumulatorId'],l[\"metricType\"],l['name'],root[\"nodeName\"],metricsid[0],root[\"simpleString\"]])\n",
    "            if root[\"children\"] is not None:\n",
    "                for c in root[\"children\"]:\n",
    "                    get_metric(c)\n",
    "        for c in metrics:\n",
    "            get_metric(c)\n",
    "\n",
    "        tps={}\n",
    "        for r in coalesce:\n",
    "            rx=re.search(r\"\\[OUTPUT\\] List\\((.*)\\)\",r[5])\n",
    "            if rx:\n",
    "                if r[4] not in tps:\n",
    "                    tps[r[4]]={}\n",
    "                    fds=rx.group(1).split(\", \")\n",
    "                    for f in fds:\n",
    "                        if f.endswith(\"Type\"):\n",
    "                            tp=re.search(r\":(.+Type)\",f).group(1)\n",
    "                            if tp not in tps[r[4]]:\n",
    "                                tps[r[4]][tp]=1\n",
    "                            else:\n",
    "                                tps[r[4]][tp]+=1\n",
    "        if len(tps)>0:\n",
    "            typedf=pandas.DataFrame(tps).T.reset_index()\n",
    "            typedf=typedf.fillna(0)\n",
    "            shuffle_pdf=pandas.merge(shuffle_pdf,typedf,left_on=\"nodeID\",right_on=\"index\")\n",
    "            shufflewritepdf=shuffle_pdf\n",
    "            ax=shufflewritepdf.plot.bar(y=[\"number of input rows\"],stacked=True,figsize=(30,8),title=\"rows vs. shuffle data type\")\n",
    "            ax=shufflewritepdf.plot(ax=ax,y=list(typedf.columns[1:]),secondary_y=list(typedf.columns[1:]),style=\"-o\")\n",
    "            self.print_real_queryid(ax,shufflewritepdf)\n",
    "            ax=shufflewritepdf.plot.bar(y=[\"time to split\"],stacked=True,figsize=(30,8),title=\"split time vs. shuffle data type\")\n",
    "            ax=shufflewritepdf.plot(ax=ax,y=list(typedf.columns[1:]),secondary_y=list(typedf.columns[1:]),style=\"-o\")\n",
    "            self.print_real_queryid(ax,shufflewritepdf)\n",
    "\n",
    "        \n",
    "        \n",
    "        shufflewritepdf.plot(x=\"shuffle bytes written\",y=[\"shuffle write time\",\"time to split\"],figsize=(30,8),style=\"*\")\n",
    "        shufflewritepdf[\"avg shuffle batch size after split\"]=shufflewritepdf[\"shuffle bytes written\"]*1000000/shufflewritepdf['records read']\n",
    "        shufflewritepdf[\"avg raw batch size after split\"]=shufflewritepdf[\"data size\"]*1000000/shufflewritepdf['records read']\n",
    "        ax=shufflewritepdf.plot(y=[\"avg shuffle batch size after split\",\"avg raw batch size after split\",\"shuffle bytes written\"],secondary_y=[\"shuffle bytes written\"],figsize=(30,8),style=\"-*\",title=\"avg batch KB after split\")\n",
    "        self.print_real_queryid(ax,shufflewritepdf)\n",
    "        shufflewritepdf[\"avg batch# per splitted partition\"]=shufflewritepdf['records read']/(shufflewritepdf['local blocks read']+shufflewritepdf['remote blocks read'])\n",
    "        ax=shufflewritepdf.plot(y=[\"avg batch# per splitted partition\",'records read'],secondary_y=['records read'],figsize=(30,8),style=\"-*\",title=\"avg batch# per splitted partition\")\n",
    "        self.print_real_queryid(ax,shufflewritepdf)\n",
    "        fig, ax = plt.subplots(figsize=(30,8))\n",
    "        ax.set_title('shuffle wite bytes with stddev')\n",
    "        ax.errorbar(x=shuffle_pdf.index,y=shuffle_pdf['shuffle bytes written_mean'], yerr=shuffle_pdf['shuffle bytes written_stddev'], linestyle='None', marker='o')\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        shuffle_pdf['record batch per mapper per reducer']=shuffle_pdf['records read']/(shuffle_pdf[\"map_partnum\"]*shuffle_pdf['reducer_partnum'])\n",
    "        ax=shuffle_pdf.plot(y=[\"record batch per mapper per reducer\"],figsize=(30,8),style=\"b-*\",title=\"record batch per mapper per reducer\")\n",
    "        self.print_real_queryid(ax,shuffle_pdf)\n",
    "        \n",
    "        inputsize = self.df.select(\"Stage ID\",\"Executor ID\", \"Task ID\", F.explode(\"Accumulables\")) \\\n",
    "              .select(\"Stage ID\",\"Executor ID\", \"Task ID\",\"col.*\") \\\n",
    "              .where(\"Name='input size in bytes' or Name='size of files read'\") \\\n",
    "              .groupBy(\"Task ID\") \\\n",
    "              .agg((F.sum(\"Update\")).alias(\"input read\"))\n",
    "        stageinput=self.df.where(\"event='SparkListenerTaskEnd'\" )\\\n",
    "                                .join(inputsize,on=[\"Task ID\"],how=\"left\")\\\n",
    "                                .fillna(0) \\\n",
    "                                .select(F.col('Host'), F.col(\"real_queryid\"),F.col('Stage ID'),F.col('Task ID'),\n",
    "                                        F.round((F.col('Finish Time')/1000-F.col('Launch Time')/1000),2).alias('elapsedtime'),\n",
    "                                        F.round((F.col('`input read`')+F.col('`Bytes Read`')+F.col('`Local Bytes Read`')+F.col('`Remote Bytes Read`'))/1024/1024,2).alias('input'))\n",
    "        baisstage=stageinput.groupBy(\"real_queryid\",\"Stage ID\").agg(F.mean(\"elapsedtime\").alias(\"elapsed\"),F.mean(\"input\").alias(\"input\"),\n",
    "                                                            (F.stddev(\"elapsedtime\")).alias(\"elapsedtime_err\"),\n",
    "                                                            (F.stddev(\"input\")).alias(\"input_err\"),\n",
    "                                                            (F.max(\"elapsedtime\")-F.mean(\"elapsedtime\")).alias(\"elapsed_max\"),\n",
    "                                                            (F.mean(\"elapsedtime\")-F.min(\"elapsedtime\")).alias(\"elapsed_min\"),\n",
    "                                                            (F.max(\"input\")-F.mean(\"input\")).alias(\"input_max\"),\n",
    "                                                            (F.mean(\"input\")-F.min(\"input\")).alias(\"input_min\")).orderBy(\"real_queryid\",\"Stage ID\")\n",
    "        dfx=baisstage.toPandas()\n",
    "        fig, ax = plt.subplots(figsize=(30,8))\n",
    "        ax.set_title('input size')\n",
    "        ax.errorbar(x=dfx.index,y=dfx['input'], yerr=dfx['input_err'], fmt='ok', ecolor='red', lw=3)\n",
    "        ax.errorbar(x=dfx.index,y=dfx['input'],yerr=[dfx['input_min'],dfx['input_max']],\n",
    "                     fmt='.k', ecolor='gray', lw=1)\n",
    "        self.print_real_queryid(ax,dfx)\n",
    "        \n",
    "        fig, ax = plt.subplots(figsize=(30,8))\n",
    "        ax.set_title('stage time')\n",
    "\n",
    "        ax.errorbar(x=dfx.index,y=dfx['elapsed'], yerr=dfx['elapsedtime_err'], fmt='ok', ecolor='red', lw=5)\n",
    "        ax.errorbar(x=dfx.index,y=dfx['elapsed'],yerr=[dfx['elapsed_min'],dfx['elapsed_max']],\n",
    "                     fmt='.k', ecolor='gray', lw=1)\n",
    "\n",
    "        self.print_real_queryid(ax,dfx)\n",
    "        return (shuffle_pdf,dfx)\n",
    "    \n",
    "    def get_stages_w_odd_partitions(appals,**kwargs):\n",
    "        \"\"\"Return stages whose task count is not a multiple of the cluster's\n",
    "        concurrent task slots, ordered by elapsed time (descending).\"\"\"\n",
    "        if appals.df is None:\n",
    "            appals.load_data()\n",
    "        # total number of tasks the cluster can run simultaneously\n",
    "        task_slots=appals.executor_cores*appals.executor_instances/appals.taskcpus\n",
    "        task_end=appals.df.where(\"Event='SparkListenerTaskEnd'\")\n",
    "        per_stage=task_end.groupBy(\"Stage ID\",\"real_queryid\")\\\n",
    "                          .agg((F.sum(F.col('Finish Time')-F.col('Launch Time'))/1000).alias(\"elapsed time\"),\n",
    "                               F.count('*').alias('partitions'))\n",
    "        odd_stages=per_stage.where(F.col(\"partitions\")%task_slots!=0)\n",
    "        return odd_stages.orderBy(F.desc(\"elapsed time\")).toPandas()\n",
    "   \n",
    "    def get_scaned_column_v1(appals,num_queries=22):\n",
    "        \"\"\"Collect the columns read by every 'Scan arrow' node, per query.\n",
    "\n",
    "        num_queries: number of queries to inspect (default 22, matching the\n",
    "        original hard-coded range).\n",
    "        Returns a list of [queryid, 'col1,col2,...'] entries.\n",
    "        \"\"\"\n",
    "        def get_scans(node):\n",
    "            # depth-first walk collecting scan nodes into the enclosing 'scans' list\n",
    "            if node['nodeName'].startswith(\"Scan arrow\"):\n",
    "                scans.append(node)\n",
    "            # guard: leaf nodes may carry None instead of an empty child list\n",
    "            # (same check as the other plan walkers in this notebook)\n",
    "            if node['children'] is not None:\n",
    "                for c in node['children']:\n",
    "                    get_scans(c)\n",
    "\n",
    "        alltable=[]\n",
    "        for qid in range(1,num_queries+1):\n",
    "            scans=[]\n",
    "            plans=appals.queryplans.where(\"real_queryid=\"+str(qid)).collect()\n",
    "            # skip queries with no recorded plan instead of raising IndexError\n",
    "            if not plans:\n",
    "                continue\n",
    "            get_scans(plans[0])\n",
    "            for s in scans:\n",
    "                # ReadSchema looks like 'struct<a:int,b:string>'; keep only the field names\n",
    "                alltable.append([qid,\",\".join([l.split(\":\")[0] for l in re.split(r'[<>]',s['metadata']['ReadSchema'])[1].split(\",\")])])\n",
    "        return alltable\n",
    "    \n",
    "    def get_scaned_column_v2(appals,num_queries=22):\n",
    "        \"\"\"Collect the columns read by every 'ColumnarBatchScan' node, per query.\n",
    "\n",
    "        num_queries: number of queries to inspect (default 22, matching the\n",
    "        original hard-coded range).\n",
    "        Returns a list of [queryid, 'col1,col2,...'] entries.\n",
    "        \"\"\"\n",
    "        def get_scans(node):\n",
    "            # depth-first walk collecting scan nodes into the enclosing 'scans' list\n",
    "            if node['nodeName'].startswith(\"ColumnarBatchScan\"):\n",
    "                scans.append(node)\n",
    "            # guard: leaf nodes may carry None instead of an empty child list\n",
    "            # (same check as the other plan walkers in this notebook)\n",
    "            if node['children'] is not None:\n",
    "                for c in node['children']:\n",
    "                    get_scans(c)\n",
    "\n",
    "        alltable=[]\n",
    "        for qid in range(1,num_queries+1):\n",
    "            scans=[]\n",
    "            plans=appals.queryplans.where(\"real_queryid=\"+str(qid)).collect()\n",
    "            # skip queries with no recorded plan instead of raising IndexError\n",
    "            if not plans:\n",
    "                continue\n",
    "            get_scans(plans[0])\n",
    "            for s in scans:\n",
    "                # simpleString looks like 'ColumnarBatchScan [a#1,b#2]'; strip the '#id' suffixes\n",
    "                alltable.append([qid,\",\".join([l.split(\"#\")[0] for l in re.split(r\"[\\[\\]]\",s['simpleString'])[1].split(\",\")])])\n",
    "        return alltable\n",
    "    \n",
    "    def compare_query(appals,queryid,appbaseals):\n",
    "        \"\"\"Compare one query between this app (appals) and a baseline app.\n",
    "\n",
    "        Shows per-stage elapsed-time gaps, then walks the slowest stages\n",
    "        (until 90% of total stage time is covered) printing both plans.\n",
    "        \"\"\"\n",
    "        print(f\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~Query{queryid}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")\n",
    "        # bugfix: was hard-coded queryid=22; use the requested query\n",
    "        appals.show_critical_path_time_breakdown(queryid=queryid)\n",
    "        s1=appals.get_stage_stat(queryid=queryid)\n",
    "        s2=appbaseals.get_stage_stat(queryid=queryid)\n",
    "        # copy() so renaming columns doesn't write through a view of s1/s2\n",
    "        ls=s1[['Stage ID','elapsed time']].copy()\n",
    "        ls.columns=['l sid','l time']\n",
    "        rs=s2[['Stage ID','elapsed time']].copy()\n",
    "        rs.columns=['r sid','r time']\n",
    "        js=ls.join(rs)\n",
    "        js['gap']=js['r time'] - js['l time']\n",
    "        js['gap']=js['gap'].round(2)\n",
    "        display(js)\n",
    "        display(s1)\n",
    "        display(s2)\n",
    "        # map stages of the two runs positionally (row x of s1 <-> row x of s2)\n",
    "        stagesmap={}\n",
    "        for x in range(0,min(len(s1),len(s2))):\n",
    "            stagesmap[s1['Stage ID'][x]]=s2['Stage ID'][x]\n",
    "        totaltime=sum(s1['elapsed time'])\n",
    "        acctime=0\n",
    "        s1time=s1.sort_values(\"elapsed time\",ascending=False,ignore_index=True)\n",
    "        ldfx=appals.get_metric_output_rowcnt(queryid=queryid)\n",
    "        rdfx=appbaseals.get_metric_output_rowcnt(queryid=queryid)\n",
    "\n",
    "        for x in range(0,len(s1time)):\n",
    "            sid1=int(s1time['Stage ID'][x])\n",
    "            sid2=int(stagesmap[sid1])\n",
    "            print(\"============================================================\")\n",
    "            display(ldfx[ldfx['Stage ID']==sid1])\n",
    "            # bugfix: filter rdfx with its own Stage ID column (was ldfx's mask)\n",
    "            display(rdfx[rdfx['Stage ID']==sid2])\n",
    "            print(f\" Gazelle  Query {queryid}  Stage {sid1}\")\n",
    "            xf=appals.get_query_plan(stageid=sid1,show_simple_string=True)\n",
    "            print(f\" Photon  Query {queryid}  Stage {sid2}\")\n",
    "            xf=appbaseals.get_query_plan(stageid=sid2,show_simple_string=True)\n",
    "            acctime+=s1time['elapsed time'][x]\n",
    "            # stop once the slowest stages cover 90% of the total elapsed time\n",
    "            if acctime/totaltime>=0.9:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Spark config keys that are expected to differ between any two runs\n",
    "# (app ids, timestamps, host-specific paths); excluded from the diff below.\n",
    "notlist=['resource.executor.cores',\n",
    " 'spark.app.id',\n",
    " 'spark.app.initial.file.urls',\n",
    " 'spark.app.name',\n",
    " 'spark.app.startTime',\n",
    " 'spark.driver.port',\n",
    " 'spark.job.description',\n",
    " 'spark.jobGroup.id',\n",
    " 'spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n",
    " 'spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n",
    " 'spark.rdd.scope',\n",
    " 'spark.sql.execution.id',\n",
    " '__fetch_continuous_blocks_in_batch_enabled',\n",
    " # bugfix: a missing comma here used to implicitly concatenate two entries\n",
    " # into one bogus key ('spark.driver.appUIAddressspark.driver.appUIAddress'),\n",
    " # so the intended key was never excluded; duplicates also removed.\n",
    " 'spark.driver.appUIAddress',\n",
    " 'spark.driver.host',\n",
    " 'spark.driver.extraClassPath',\n",
    " 'spark.eventLog.dir',\n",
    " 'spark.executorEnv.CC',\n",
    " 'spark.executorEnv.LD_LIBRARY_PATH',\n",
    " 'spark.executorEnv.LD_PRELOAD',\n",
    " 'spark.executorEnv.LIBARROW_DIR',\n",
    " 'spark.files',\n",
    " 'spark.history.fs.logDirectory',\n",
    " 'spark.sql.warehouse.dir',\n",
    " 'spark.yarn.appMasterEnv.LD_PRELOAD',\n",
    " 'spark.yarn.dist.files'\n",
    "]\n",
    "def comp_spark_conf(app0,app1):\n",
    "    \"\"\"Diff the Spark configs of two apps, ignoring keys in notlist.\n",
    "\n",
    "    Returns the config rows whose (lower-cased) values differ between apps.\n",
    "    \"\"\"\n",
    "    pdf_sparkconf_0=app0.get_spark_config()\n",
    "    pdf_sparkconf_1=app1.get_spark_config()\n",
    "    # suffix the value columns with the last 8 chars of each appid to tell them apart\n",
    "    pdfc=pdf_sparkconf_0.join(pdf_sparkconf_1,lsuffix=app0.appid[-8:],rsuffix=app1.appid[-8:])\n",
    "    # case-insensitive comparison of the two value columns\n",
    "    pdfc[\"0\"+app0.appid[-8:]]=pdfc[\"0\"+app0.appid[-8:]].str.lower()\n",
    "    pdfc[\"0\"+app1.appid[-8:]]=pdfc[\"0\"+app1.appid[-8:]].str.lower()\n",
    "\n",
    "    pdfc['comp']=(pdfc[\"0\"+app0.appid[-8:]]==pdfc[\"0\"+app1.appid[-8:]])\n",
    "    return pdfc.loc[(pdfc['comp']==False) & (~pdfc.index.isin(notlist))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Node log analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "@pandas_udf(\"host string, id string,taskid int, time double\", PandasUDFType.GROUPED_MAP)\n",
    "def collect_udf_time(pdf):\n",
    "    # Grouped-map pandas UDF: for each (Host, Executor ID, appid) row, fetch the\n",
    "    # YARN node-manager container stderr log over HTTP and extract the lines\n",
    "    # starting with 'start UDF' / 'stop UDF' into (host, id, taskid, time) rows.\n",
    "    # NOTE(review): 'request' is presumably urllib.request imported earlier in\n",
    "    # the notebook -- confirm against the setup cells.\n",
    "    proxy_handler = request.ProxyHandler({})\n",
    "    opener = request.build_opener(proxy_handler)\n",
    "\n",
    "    rst=[]\n",
    "    for idx,l in pdf.iterrows():\n",
    "        # NOTE(review): node IP is hard-coded as 10.1.2.19X using the last digit\n",
    "        # of the hostname -- cluster-specific; verify before reusing elsewhere.\n",
    "        ip=\"10.1.2.19\"+l['Host'][-1:]\n",
    "        # container ids are 1-based and zero-padded to 6 digits\n",
    "        execid=\"{:06d}\".format(int(l['Executor ID'])+1)\n",
    "        appid=l['appid']\n",
    "        url = f'http://{ip}:8042/node/containerlogs/container_{appid}_01_{execid}/sparkuser/stderr/?start=0'\n",
    "        # open the website with the opener\n",
    "        req = opener.open(url)\n",
    "        data = req.read().decode('utf8')\n",
    "        cnt=data.split(\"\\n\")\n",
    "        cnt_udf=[l.split(\" \") for l in cnt if l.startswith('start UDF') or l.startswith('stop UDF')]\n",
    "        unf_pdf=pandas.DataFrame(cnt_udf)\n",
    "        # space-split log line: field 0 = 'start'/'stop', field 4 = taskid, field 6 = timestamp\n",
    "        srst=unf_pdf.loc[:,[0,4,6]]\n",
    "        srst.columns=['id','taskid','time']\n",
    "        srst['host']=l['Host']\n",
    "        srst['taskid']=srst['taskid'].astype(int)\n",
    "        # keep only the numeric seconds portion of the timestamp field\n",
    "        srst['time']=srst['time'].apply(lambda f: float(re.search('\\d+\\.\\d+',f).group(0)))\n",
    "        rst.append(srst)\n",
    "    return pandas.concat(rst)\n",
    "\n",
    "\n",
    "class App_Log_Analysis_Node_log(App_Log_Analysis):\n",
    "    \"\"\"App_Log_Analysis variant that overlays per-task UDF timing (scraped from\n",
    "    node container logs via collect_udf_time) onto the task trace view.\"\"\"\n",
    "    def __init__(self, appid,jobids):\n",
    "        App_Log_Analysis.__init__(self, appid,jobids)\n",
    "    \n",
    "    def generate_trace_view_list(self,id=0, **kwargs):\n",
    "        \"\"\"Build a Chrome trace-viewer event list: one 'X' (complete) slice per\n",
    "        task, grouped into per-executor pids, plus 'udf' slices parsed from the\n",
    "        node logs. Returns a list of JSON-encoded trace-event strings.\"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        showcpu=kwargs['showcpu'] if 'showcpu' in kwargs else False\n",
    "        \n",
    "        appid=self.appid\n",
    "        events=self.df.toPandas()\n",
    "        # coretrack maps pid -> {slot: [taskid, starttime]}; [-1,-1] marks a free slot\n",
    "        coretrack={}\n",
    "        trace_events=[]\n",
    "        starttime=0\n",
    "        taskend=[]\n",
    "        trace={\"traceEvents\":[]}\n",
    "        exec_hosts={}\n",
    "        hostsdf=self.df.select(\"Host\").distinct().orderBy(\"Host\")\n",
    "        hostid=100000\n",
    "        ended_event=[]\n",
    "\n",
    "        # give each host a distinct 100000-wide pid band;\n",
    "        # an executor's pid is host_band + executor_id*100\n",
    "        for i,l in hostsdf.toPandas().iterrows():\n",
    "            exec_hosts[l['Host']]=hostid\n",
    "            hostid=hostid+100000\n",
    "\n",
    "        tskmap={}\n",
    "        for idx,l in events.iterrows():\n",
    "            if l['Event']=='SparkListenerTaskStart':\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "\n",
    "                tsk=l['Task ID']\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                stime=l['Launch Time']\n",
    "                #the task's starttime and finishtime is the same, ignore it.\n",
    "                if tsk in ended_event:\n",
    "                    continue\n",
    "                if not pid in coretrack:\n",
    "                    # first task on this executor: emit the process-name metadata event\n",
    "                    tids={}\n",
    "                    trace_events.append({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"{:s}.{:s}\".format(l['Host'],l['Executor ID'])}\n",
    "                      })\n",
    "\n",
    "                else:\n",
    "                    tids=coretrack[pid]\n",
    "                # reuse a free slot (marked [-1,-1]) or open a new one (for-else)\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==-1:\n",
    "                        tids[t]=[tsk,stime]\n",
    "                        break\n",
    "                else:\n",
    "                    t=len(tids)\n",
    "                    tids[t]=[tsk,stime]\n",
    "                #print(\"task {:d} tid is {:s}.{:d}\".format(tsk,pid,t))\n",
    "                coretrack[pid]=tids\n",
    "\n",
    "            if l['Event']=='SparkListenerTaskEnd':\n",
    "                sevt={}\n",
    "                eevt={}\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                tsk=l['Task ID']\n",
    "                fintime=l['Finish Time']\n",
    "\n",
    "                tids=coretrack[pid]\n",
    "                # free the slot that held this task; if none holds it, the task\n",
    "                # ends at its own start -- remember it so TaskStart skips it\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==tsk:\n",
    "                        tids[t]=[-1,-1]\n",
    "                        break\n",
    "                else:\n",
    "                    ended_event.append(tsk)\n",
    "                    continue\n",
    "                # NOTE(review): if another slot started within 2 time units before\n",
    "                # this finish, swap slots -- appears to compact near-simultaneous\n",
    "                # task handoffs onto one trace row; confirm intent.\n",
    "                for ps in reversed([key for key in tids.keys()]) :\n",
    "                    if tids[ps][1]-fintime<0 and tids[ps][1]-fintime>=-2:\n",
    "                        fintime=tids[ps][1]\n",
    "                        tids[t]=tids[ps]\n",
    "                        tids[ps]=[-1,-1]\n",
    "                        break\n",
    "                # trace timestamps are relative to the first finished task's launch\n",
    "                if starttime==0:\n",
    "                    starttime=l['Launch Time']\n",
    "\n",
    "                sstime=l['Launch Time']-starttime\n",
    "\n",
    "                trace_events.append({\n",
    "                       'tid':pid+int(t),\n",
    "                       'ts':sstime,\n",
    "                       'dur':fintime-l['Launch Time'],\n",
    "                       'pid':pid,\n",
    "                       \"ph\":'X',\n",
    "                       'name':\"stg{:d}\".format(l['Stage ID']),\n",
    "                       'args':{\"job id\": l['job id'],\n",
    "                               \"stage id\": l['Stage ID'],\n",
    "                               \"tskid\":tsk,\n",
    "                               \"input\":builtins.round(l[\"Bytes Read\"]/1024/1024,2),\n",
    "                               \"spill\":builtins.round(l[\"Memory Bytes Spilled\"]/1024/1024,2),\n",
    "                               \"Shuffle Read Metrics\": \"\",\n",
    "                               \"|---Local Read\": builtins.round(l[\"Local Bytes Read\"]/1024/1024,2),\n",
    "                               \"|---Remote Read\":builtins.round(l[\"Remote Bytes Read\"]/1024/1024,2),\n",
    "                               \"Shuffle Write Metrics\": \"\",\n",
    "                               \"|---Write\":builtins.round(l['Shuffle Bytes Written']/1024/1024,2)\n",
    "                               }\n",
    "                      })\n",
    "                tskmap[tsk]={'pid':pid,'tid':pid+int(t)}\n",
    "\n",
    "        self.starttime=starttime\n",
    "        self.tskmap=tskmap\n",
    "\n",
    "        # overlay UDF timing: scrape each node's container log, pair every\n",
    "        # 'start' with the earliest 'stop' at or after it, and emit 'udf' slices\n",
    "        hostdf=self.df.select('Host','Executor ID',F.lit(appid[len('application_'):]).alias('appid')).distinct().orderBy('Host')\n",
    "        rst=hostdf.groupBy('Host').apply(collect_udf_time)\n",
    "        rst.cache()\n",
    "        start_df=rst.where(\"id='start'\").select(F.col('taskid').alias('start_taskid'),F.col('time').alias(\"starttime\"))\n",
    "        stop_df=rst.where(\"id='stop'\").select('taskid',F.col('time').alias(\"stop_time\"))\n",
    "        df=start_df.join(stop_df, on=[start_df.start_taskid==stop_df.taskid,stop_df['stop_time']>=start_df['starttime']],how='left').groupBy('taskid','starttime').agg(F.min('stop_time').alias('stop_time'))\n",
    "        pdf=df.toPandas() \n",
    "        for idx,l in pdf.iterrows():\n",
    "                trace_events.append({\n",
    "                     'tid':self.tskmap[l['taskid']]['tid'],\n",
    "                     'ts':l['starttime']*1000-self.starttime,\n",
    "                     'dur':(l['stop_time']-l['starttime'])*1000,                \n",
    "                     'pid':self.tskmap[l['taskid']]['pid'],\n",
    "                     'ph':'X',\n",
    "                     'name':'udf'})\n",
    "        \n",
    "        return [json.dumps(l) for l in trace_events]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): this cell re-defines App_Log_Analysis_Node_log with the same\n",
    "# body as the previous cell (minus the collect_udf_time UDF, which it still\n",
    "# references); whichever cell runs last wins. Consider deleting one copy.\n",
    "class App_Log_Analysis_Node_log(App_Log_Analysis):\n",
    "    \"\"\"App_Log_Analysis variant that overlays per-task UDF timing (scraped from\n",
    "    node container logs via collect_udf_time) onto the task trace view.\"\"\"\n",
    "    def __init__(self, appid,jobids):\n",
    "        App_Log_Analysis.__init__(self, appid,jobids)\n",
    "    \n",
    "    def generate_trace_view_list(self,id=0, **kwargs):\n",
    "        \"\"\"Build a Chrome trace-viewer event list: one 'X' (complete) slice per\n",
    "        task, grouped into per-executor pids, plus 'udf' slices parsed from the\n",
    "        node logs. Returns a list of JSON-encoded trace-event strings.\"\"\"\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        showcpu=kwargs['showcpu'] if 'showcpu' in kwargs else False\n",
    "        \n",
    "        appid=self.appid\n",
    "        events=self.df.toPandas()\n",
    "        # coretrack maps pid -> {slot: [taskid, starttime]}; [-1,-1] marks a free slot\n",
    "        coretrack={}\n",
    "        trace_events=[]\n",
    "        starttime=0\n",
    "        taskend=[]\n",
    "        trace={\"traceEvents\":[]}\n",
    "        exec_hosts={}\n",
    "        hostsdf=self.df.select(\"Host\").distinct().orderBy(\"Host\")\n",
    "        hostid=100000\n",
    "        ended_event=[]\n",
    "\n",
    "        # give each host a distinct 100000-wide pid band;\n",
    "        # an executor's pid is host_band + executor_id*100\n",
    "        for i,l in hostsdf.toPandas().iterrows():\n",
    "            exec_hosts[l['Host']]=hostid\n",
    "            hostid=hostid+100000\n",
    "\n",
    "        tskmap={}\n",
    "        for idx,l in events.iterrows():\n",
    "            if l['Event']=='SparkListenerTaskStart':\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "\n",
    "                tsk=l['Task ID']\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                stime=l['Launch Time']\n",
    "                #the task's starttime and finishtime is the same, ignore it.\n",
    "                if tsk in ended_event:\n",
    "                    continue\n",
    "                if not pid in coretrack:\n",
    "                    # first task on this executor: emit the process-name metadata event\n",
    "                    tids={}\n",
    "                    trace_events.append({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"{:s}.{:s}\".format(l['Host'],l['Executor ID'])}\n",
    "                      })\n",
    "\n",
    "                else:\n",
    "                    tids=coretrack[pid]\n",
    "                # reuse a free slot (marked [-1,-1]) or open a new one (for-else)\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==-1:\n",
    "                        tids[t]=[tsk,stime]\n",
    "                        break\n",
    "                else:\n",
    "                    t=len(tids)\n",
    "                    tids[t]=[tsk,stime]\n",
    "                #print(\"task {:d} tid is {:s}.{:d}\".format(tsk,pid,t))\n",
    "                coretrack[pid]=tids\n",
    "\n",
    "            if l['Event']=='SparkListenerTaskEnd':\n",
    "                sevt={}\n",
    "                eevt={}\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                tsk=l['Task ID']\n",
    "                fintime=l['Finish Time']\n",
    "\n",
    "                tids=coretrack[pid]\n",
    "                # free the slot that held this task; if none holds it, the task\n",
    "                # ends at its own start -- remember it so TaskStart skips it\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==tsk:\n",
    "                        tids[t]=[-1,-1]\n",
    "                        break\n",
    "                else:\n",
    "                    ended_event.append(tsk)\n",
    "                    continue\n",
    "                # NOTE(review): if another slot started within 2 time units before\n",
    "                # this finish, swap slots -- appears to compact near-simultaneous\n",
    "                # task handoffs onto one trace row; confirm intent.\n",
    "                for ps in reversed([key for key in tids.keys()]) :\n",
    "                    if tids[ps][1]-fintime<0 and tids[ps][1]-fintime>=-2:\n",
    "                        fintime=tids[ps][1]\n",
    "                        tids[t]=tids[ps]\n",
    "                        tids[ps]=[-1,-1]\n",
    "                        break\n",
    "                # trace timestamps are relative to the first finished task's launch\n",
    "                if starttime==0:\n",
    "                    starttime=l['Launch Time']\n",
    "\n",
    "                sstime=l['Launch Time']-starttime\n",
    "\n",
    "                trace_events.append({\n",
    "                       'tid':pid+int(t),\n",
    "                       'ts':sstime,\n",
    "                       'dur':fintime-l['Launch Time'],\n",
    "                       'pid':pid,\n",
    "                       \"ph\":'X',\n",
    "                       'name':\"stg{:d}\".format(l['Stage ID']),\n",
    "                       'args':{\"job id\": l['job id'],\n",
    "                               \"stage id\": l['Stage ID'],\n",
    "                               \"tskid\":tsk,\n",
    "                               \"input\":builtins.round(l[\"Bytes Read\"]/1024/1024,2),\n",
    "                               \"spill\":builtins.round(l[\"Memory Bytes Spilled\"]/1024/1024,2),\n",
    "                               \"Shuffle Read Metrics\": \"\",\n",
    "                               \"|---Local Read\": builtins.round(l[\"Local Bytes Read\"]/1024/1024,2),\n",
    "                               \"|---Remote Read\":builtins.round(l[\"Remote Bytes Read\"]/1024/1024,2),\n",
    "                               \"Shuffle Write Metrics\": \"\",\n",
    "                               \"|---Write\":builtins.round(l['Shuffle Bytes Written']/1024/1024,2)\n",
    "                               }\n",
    "                      })\n",
    "                tskmap[tsk]={'pid':pid,'tid':pid+int(t)}\n",
    "\n",
    "        self.starttime=starttime\n",
    "        self.tskmap=tskmap\n",
    "\n",
    "        # overlay UDF timing: scrape each node's container log, pair every\n",
    "        # 'start' with the earliest 'stop' at or after it, and emit 'udf' slices\n",
    "        hostdf=self.df.select('Host','Executor ID',F.lit(appid[len('application_'):]).alias('appid')).distinct().orderBy('Host')\n",
    "        rst=hostdf.groupBy('Host').apply(collect_udf_time)\n",
    "        rst.cache()\n",
    "        start_df=rst.where(\"id='start'\").select(F.col('taskid').alias('start_taskid'),F.col('time').alias(\"starttime\"))\n",
    "        stop_df=rst.where(\"id='stop'\").select('taskid',F.col('time').alias(\"stop_time\"))\n",
    "        df=start_df.join(stop_df, on=[start_df.start_taskid==stop_df.taskid,stop_df['stop_time']>=start_df['starttime']],how='left').groupBy('taskid','starttime').agg(F.min('stop_time').alias('stop_time'))\n",
    "        pdf=df.toPandas() \n",
    "        for idx,l in pdf.iterrows():\n",
    "                trace_events.append({\n",
    "                     'tid':self.tskmap[l['taskid']]['tid'],\n",
    "                     'ts':l['starttime']*1000-self.starttime,\n",
    "                     'dur':(l['stop_time']-l['starttime'])*1000,                \n",
    "                     'pid':self.tskmap[l['taskid']]['pid'],\n",
    "                     'ph':'X',\n",
    "                     'name':'udf'})\n",
    "        \n",
    "        return [json.dumps(l) for l in trace_events]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class App_Log_Analysis_Node_Log_Uni(App_Log_Analysis):\n",
    "    def __init__(self, file,jobids):\n",
    "        App_Log_Analysis.__init__(self, file,jobids)\n",
    "    \n",
    "    def generate_trace_view_list(self,id=0, **kwargs):\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        showcpu=False\n",
    "        \n",
    "        shownodes=kwargs.get(\"shownodes\",None)\n",
    "\n",
    "        showdf=self.df #self.df.where(F.col(\"Host\").isin(shownodes)) if shownodes else self.df\n",
    "\n",
    "        events=showdf.drop(\"Accumulables\",\"Stage IDs\").orderBy(\"Launch Time\",\"Finish Time\").toPandas()\n",
    "        coretrack={}\n",
    "        trace_events=[]\n",
    "        starttime=0\n",
    "        taskend=[]\n",
    "        trace={\"traceEvents\":[]}\n",
    "        exec_hosts={}\n",
    "        hostsdf=showdf.select(\"Host\").distinct().orderBy(\"Host\")\n",
    "        hostid=100000\n",
    "        ended_event=[]\n",
    "\n",
    "        applog=os.path.splitext(self.file)[0]+\".stdout\"\n",
    "        logdfs=[]\n",
    "        if fs.exists(applog):\n",
    "            logdata=sc.textFile(os.path.splitext(self.file)[0]+\".stdout\",84)\n",
    "            logdf=logdata.mapPartitions(splits).toDF()\n",
    "            logdfs.append(logdf)\n",
    "\n",
    "        p=os.path.split(self.file)\n",
    "        for c in shownodes:\n",
    "            f=p[0]+\"/\"+c+\"/xgbtck.txt\"\n",
    "            if fs.exists(f):\n",
    "                logdata=sc.textFile(f,84)\n",
    "                logdf=logdata.mapPartitions(splits).toDF()\n",
    "                logdfs.append(logdf)\n",
    "        logdf=reduce(lambda l,r: l.concat(r),logdfs)\n",
    "        logdf=logdf.cache()\n",
    "        logdf.count()\n",
    "\n",
    "        firstrow=logdf.limit(1).collect()\n",
    "\n",
    "        for c in logdf.columns:\n",
    "            if firstrow[0][c]!=\"xgbtck\":\n",
    "                logdf=logdf.drop(c)\n",
    "            else:\n",
    "                break\n",
    "\n",
    "        usefulc=[\"xgbtck\",\"event\",\"ts\",\"elapsed\",\"threadid\",\"taskid\"]\n",
    "        for i in range(0,len(usefulc)):\n",
    "            logdf=logdf.withColumnRenamed(logdf.columns[i],usefulc[i])\n",
    "\n",
    "        logdf=logdf.where(F.col(\"event\").isin(['load_library','data_load','data_convert']))\n",
    "        \n",
    "        task_thread=logdf.where(\"event='data_convert'\").select(F.col(\"taskid\").astype(IntegerType()),F.col(\"threadid\").astype(IntegerType())).distinct().toPandas().set_index('taskid').to_dict('index')\n",
    "        #task_thread={}\n",
    "\n",
    "        for i,l in hostsdf.toPandas().iterrows():\n",
    "            exec_hosts[l['Host']]=hostid\n",
    "            hostid=hostid+100000\n",
    "\n",
    "        tskmap={}\n",
    "        for idx,l in events.iterrows():\n",
    "            if l['Event']=='SparkListenerTaskStart':\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "\n",
    "                tsk=l['Task ID']\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                stime=l['Launch Time']\n",
    "                #the task's starttime and finishtime is the same, ignore it.\n",
    "                if tsk in ended_event:\n",
    "                    continue\n",
    "                if not pid in coretrack:\n",
    "                    tids={}\n",
    "                    trace_events.append({\n",
    "                       \"name\": \"process_name\",\n",
    "                       \"ph\": \"M\",\n",
    "                       \"pid\":pid,\n",
    "                       \"tid\":0,\n",
    "                       \"args\":{\"name\":\"{:s}.{:s}\".format(l['Host'],l['Executor ID'])}\n",
    "                      })\n",
    "\n",
    "                else:\n",
    "                    tids=coretrack[pid]\n",
    "\n",
    "                tidarr=[tsk,stime]\n",
    "\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==-1:\n",
    "                        tids[t]=tidarr\n",
    "                        break\n",
    "                else:\n",
    "                    t=len(tids)\n",
    "                    tids[t]=tidarr\n",
    "                #print(\"task {:d} tid is {:s}.{:d}\".format(tsk,pid,t))\n",
    "                coretrack[pid]=tids\n",
    "\n",
    "            if l['Event']=='SparkListenerTaskEnd':\n",
    "                sevt={}\n",
    "                eevt={}\n",
    "                hostid=exec_hosts[l['Host']]\n",
    "                pid=int(l['Executor ID'])*100+hostid\n",
    "                tsk=l['Task ID']\n",
    "                fintime=l['Finish Time']\n",
    "\n",
    "                tids=coretrack[pid]\n",
    "                for t in tids.keys():\n",
    "                    if tids[t][0]==tsk:\n",
    "                        tids[t]=[-1,-1]\n",
    "                        break\n",
    "                else:\n",
    "                    ended_event.append(tsk)\n",
    "                    continue\n",
    "                for ps in reversed([key for key in tids.keys()]):\n",
    "                    if (tids[ps][1]-fintime<0 and tids[ps][1]-fintime>=-2) or \\\n",
    "                        (tsk in task_thread and tids[ps][0] in task_thread and task_thread[tsk][\"threadid\"]==task_thread[tids[ps][0]][\"threadid\"]):\n",
    "                        fintime=tids[ps][1]\n",
    "                        tids[t]=tids[ps]\n",
    "                        tids[ps]=[-1,-1]\n",
    "                        break\n",
    "                if starttime==0:\n",
    "                    starttime=l['Launch Time']\n",
    "\n",
    "                sstime=l['Launch Time']-starttime\n",
    "\n",
    "                trace_events.append({\n",
    "                       'tid':pid+int(t),\n",
    "                       'ts':sstime,\n",
    "                       'dur':fintime-l['Launch Time'],\n",
    "                       'pid':pid,\n",
    "                       \"ph\":'X',\n",
    "                       'name':\"stg{:d}\".format(l['Stage ID']),\n",
    "                       'args':{\"job id\": l['Job ID'],\n",
    "                               \"stage id\": l['Stage ID'],\n",
    "                               \"tskid\":tsk,\n",
    "                               \"input\":builtins.round(l[\"Bytes Read\"]/1024/1024,2),\n",
    "                               \"spill\":builtins.round(l[\"Memory Bytes Spilled\"]/1024/1024,2),\n",
    "                               \"Shuffle Read Metrics\": \"\",\n",
    "                               \"|---Local Read\": builtins.round(l[\"Local Bytes Read\"]/1024/1024,2),\n",
    "                               \"|---Remote Read\":builtins.round(l[\"Remote Bytes Read\"]/1024/1024,2),\n",
    "                               \"Shuffle Write Metrics\": \"\",\n",
    "                               \"|---Write\":builtins.round(l['Shuffle Bytes Written']/1024/1024,2)\n",
    "                               }\n",
    "                      })\n",
    "                tskmap[tsk]={'pid':pid,'tid':pid+int(t)}\n",
    "\n",
    "        self.starttime=starttime\n",
    "        self.tskmap=tskmap\n",
    "\n",
    "        tskmapdf = spark.createDataFrame(pandas.DataFrame(self.tskmap).T.reset_index())\n",
    "        logdf=logdf.withColumn(\"ts\",F.col(\"ts\").astype(LongType()))\n",
    "        logdf=logdf.withColumn(\"taskid\",F.col(\"taskid\").astype(LongType()))\n",
    "        logdf=logdf.withColumnRenamed(\"event\",'type')\n",
    "        mgd=logdf.join(tskmapdf,on=(F.col('taskid')==F.col(\"index\")),how=\"right\")\n",
    "        rstdf=mgd.select(F.col('tid').alias(\"tid\"),\n",
    "          (F.round(F.col('ts')-F.lit(self.starttime),3)).alias(\"ts\"),\n",
    "          F.round(F.col(\"elapsed\"),3).alias(\"dur\"),\n",
    "           F.lit(F.col('pid')).alias(\"pid\"),\n",
    "           F.lit(\"X\").alias(\"ph\"),\n",
    "           F.col(\"type\").alias(\"name\")\n",
    "           ).where(F.col(\"ts\").isNotNull()).orderBy('ts')\n",
    "\n",
    "        #        logdf=logdf.withColumn(\"type\",F.substring_index(\"event\",\"_\",1))\n",
    "        #        window= Window.partitionBy(logdf['taskid']).orderBy(\"type\",\"ts\")\n",
    "        #        logdfx=logdf.select(\"taskid\",\"event\",\"type\",\"ts\",F.lag('ts',1).over(window).alias(\"last\"),F.lag('rownum',1).over(window).alias(\"rownum\")).orderBy(\"taskid\",\"ts\").where(\"event like '%end'\")\n",
    "\n",
    "\n",
    "        output=[json.dumps(l) for l in trace_events]\n",
    "        output.extend(rstdf.toJSON().collect())\n",
    "\n",
    "        return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# perf trace analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Partition mapper for sc.textFile over `perf trace` / `perf sched` text output.\n",
    "# Each line is matched against a set of patterns and reduced to a 5-tuple:\n",
    "#   rst1/rstx: sched_switch       -> (time, 'sched_switch', prev_tid, prev_state, next_tid)\n",
    "#   rst2/rst3: recvfrom/sendto    -> (time, syscall, pid, duration_ms_or_0, size)\n",
    "#   rst4     : '[continued]' line -> (time, syscall, pid, duration_ms, 0)\n",
    "#   rst5/rst6: poll               -> (time, 'poll', pid, duration_ms_or_0, 0)\n",
    "# Unmatched non-indented lines become (0, raw_line, '', '', '') so callers can\n",
    "# preview what failed to parse (see Perf_trace_analysis.load_data's _1=0 probe).\n",
    "def split_trace(x):\n",
    "    fi=[]\n",
    "    for l in x:\n",
    "        rst1=re.search(r\"^(\\d+\\.\\d+).*sched:(sched_switch):.+:(\\d+) \\[\\d+\\] (\\S+) ==> .+:(\\d+) \"\"\",l)\n",
    "        rst2=re.search(r\"(\\d+\\.\\d+) \\( +(\\d+\\.\\d+) +ms\\):[^/]+/(\\d+) (recvfrom|sendto)\\(fd: \\d+<\\S+:\\[\\d+\\]>, \\S+: 0x[a-f0-9]+, \\S+: (\\d+)\",l)\n",
    "        rst3=re.search(r\"(\\d+\\.\\d+) \\( +\\): [^/]+/(\\d+) (recvfrom|sendto)\\(fd: \\d+<\\S+:\\[\\d+\\]>, \\S+: 0x[a-f0-9]+, \\S+: (\\d+)\",l)\n",
    "        rst4=re.search(r\"(\\d+\\.\\d+) \\( *(\\d+\\.\\d+) ms\\): [^/]+/(\\d+)  ... \\[continued\\]: (sendto|recvfrom|poll)\",l)\n",
    "        rst5=re.search(r\"(\\d+\\.\\d+) \\( +(\\d+\\.\\d+) +ms\\): [^/]+/(\\d+) (poll)\",l)\n",
    "        rst6=re.search(r\"(\\d+\\.\\d+) \\( +\\): [^/]+/(\\d+) (poll)\",l)\n",
    "\n",
    "        # alternative sched_switch format (prev_pid=.../next_pid=...); used as a\n",
    "        # fallback when the positional format (rst1) does not match\n",
    "        rstx=re.search(r\"(\\d+\\.\\d+)*sched:(sched_switch):.*prev_pid=(\\d+).*prev_state=(\\S+) ==> .*next_pid=(\\d+)\"\"\",l)\n",
    "        if not rst1:\n",
    "            rst1=rstx\n",
    "        \n",
    "        if rst1:\n",
    "            fi.append((rst1.group(1),rst1.group(2),rst1.group(3),rst1.group(4),rst1.group(5))) #time, switch, src, status, dst\n",
    "        elif rst2:\n",
    "            fi.append((rst2.group(1),rst2.group(4),rst2.group(3),rst2.group(2),rst2.group(5))) #time, sed/rcv, pid, ms, size \n",
    "        elif rst3:\n",
    "            fi.append((rst3.group(1),rst3.group(3),rst3.group(2),0, rst3.group(4)))             #time, sed/rcv, pid, 0, size\n",
    "        elif rst4:\n",
    "            fi.append((rst4.group(1),rst4.group(4),rst4.group(3),rst4.group(2), 0))              #time, sed/rcv, pid, ms, 0\n",
    "        elif rst5:\n",
    "            fi.append((rst5.group(1),rst5.group(4),rst5.group(3),rst5.group(2), 0))              #time, sed/rcv, pid, ms, 0\n",
    "        elif rst6:\n",
    "            fi.append((rst6.group(1),rst6.group(3),rst6.group(2),0, 0))              #time, sed/rcv, pid, ms0, 0\n",
    "        elif not re.match(r\"^ +?\",l):\n",
    "            # not a continuation/indented line and no pattern matched: keep raw\n",
    "            # line with sentinel time 0 for later inspection\n",
    "            fi.append((0,l,'','',''))\n",
    "    return iter(fi)\n",
    "                  \n",
    "\n",
    "\n",
    "class Perf_trace_analysis(Analysis):\n",
    "    # Analysis of `perf trace`/`perf sched` text output. load_data() parses the\n",
    "    # raw log (via split_trace) into a Spark DataFrame with columns _1.._5 and\n",
    "    # the generate_*_view_list methods turn it into Chrome trace-viewer JSON\n",
    "    # events (scheduler switches and network/poll syscalls).\n",
    "    def __init__(self,sar_file):\n",
    "        Analysis.__init__(self,sar_file)\n",
    "        self.starttime=None\n",
    "        \n",
    "    def load_data(self):\n",
    "        # Parse the perf text file and establish self.starttime (ms).\n",
    "        # Rows with _1=0 are lines no regex in split_trace matched; a small\n",
    "        # sample is displayed so parsing gaps are visible.\n",
    "        sardata=sc.textFile(self.file)\n",
    "        sardf=sardata.mapPartitions(split_trace).toDF()\n",
    "        display(sardf.where(\"_1=0\").limit(5).collect())\n",
    "        sardf=sardf.withColumn(\"_1\",F.col(\"_1\").astype(DoubleType()))\n",
    "        sardf=sardf.where(\"_1>0\")\n",
    "        starttime=sardf.agg(F.min(\"_1\")).collect()[0][0]\n",
    "        if self.starttime is None:\n",
    "            self.starttime=(float(starttime))\n",
    "        else:\n",
    "            # A starttime was injected by the caller: rebase it by the host's\n",
    "            # uptime (uptime.txt, seconds) so perf's boot-relative timestamps\n",
    "            # line up with the caller's wall-clock milliseconds.\n",
    "            paths=os.path.split(self.file)\n",
    "            if fs.exists(paths[0]+\"/uptime.txt\"):\n",
    "                with fs.open(paths[0]+\"/uptime.txt\") as f:\n",
    "                    strf=f.read().decode('ascii')\n",
    "                    print(\"input starttime:\",self.starttime,\"uptime:\",float(strf)*1000,\"record starttime:\",starttime)\n",
    "                    self.starttime=self.starttime-float(strf)*1000\n",
    "            else:\n",
    "                print(\"uptime.txt isn't found, wrong\")\n",
    "                return\n",
    "            \n",
    "        self.df=sardf\n",
    "        return sardf\n",
    "\n",
    "    def generate_sched_view_list(self,id=0,**kwargs):\n",
    "        # Emit 'X' (complete-slice) trace-viewer events for sched_switch records\n",
    "        # of the threads named by kwargs['pidstat_tids'] (list or pidstat text)\n",
    "        # or by a sched_threads.txt file next to the input. Threads must have\n",
    "        # more than kwargs['sched_cnt'] switches and a cpu-time share above\n",
    "        # kwargs['cpu_threshold'] to appear in the output.\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        starttime=starttime+kwargs.get(\"sched_time_offset\",0)\n",
    "        print(\"offset time\",starttime)\n",
    "        \n",
    "        swdf=sardf.where(\"_2='sched_switch'\")\n",
    "        \n",
    "        cputhreshold=kwargs.get(\"cpu_threshold\",0.1)\n",
    "        sched_cnt = kwargs.get(\"sched_cnt\",10)\n",
    "        \n",
    "        pidstat_tids=kwargs.get(\"pidstat_tids\",None)\n",
    "        pidstat_tids_txt=kwargs.get(\"pidstat_tids_txt\",\"sched_threads.txt\")\n",
    "        \n",
    "        if pidstat_tids:\n",
    "            if type(pidstat_tids) is list:\n",
    "                tids=pidstat_tids\n",
    "            else:\n",
    "                # pidstat text paste: the thread id is the 4th whitespace field\n",
    "                tids=[re.split(r'\\s+',t) for t in pidstat_tids.split(\"\\n\")]\n",
    "                tids=[t[3] for t in tids if len(t)>4]\n",
    "        else:\n",
    "            paths=os.path.split(self.file)\n",
    "            if fs.exists(paths[0]+\"/\"+pidstat_tids_txt):\n",
    "                with fs.open(paths[0]+\"/\"+pidstat_tids_txt) as f:\n",
    "                    tids=[l.strip() for l in f.read().decode('ascii').split(\"\\n\") if len(l)>0] \n",
    "            else:\n",
    "                print(\"Wrong, no pidstat_tids args and no sched_threads.txt file\")\n",
    "                return []\n",
    "        # keep only threads switched in (_5 = next_pid) more than sched_cnt times\n",
    "        tidcnt=swdf.where(F.col(\"_5\").isin(tids)).groupBy(\"_5\").count()\n",
    "        tidm10=tidcnt.where(\"count>{:d}\".format(sched_cnt)).select(\"_5\").collect()\n",
    "        rtids=[t[0] for t in tidm10]\n",
    "        rtiddf=swdf.where(F.col(\"_5\").isin(rtids) | F.col(\"_3\").isin(rtids))\n",
    "        rtiddf=rtiddf.withColumn(\"_1\",F.col(\"_1\").astype(DoubleType())-starttime)\n",
    "        rtiddf=rtiddf.withColumn(\"_3\",F.col(\"_3\").astype(IntegerType()))\n",
    "        rtiddf=rtiddf.withColumn(\"_5\",F.col(\"_5\").astype(IntegerType()))\n",
    "        rtiddf=rtiddf.withColumn(\"_1\",F.round(F.col(\"_1\"),3))\n",
    "        rtidcol=rtiddf.collect()\n",
    "        # tidmap: last timestamp each tracked thread was switched in or out;\n",
    "        # tidtotal: accumulated on-cpu time per thread (filled by the first pass\n",
    "        # below, used to apply cpu_threshold in the second pass)\n",
    "        tidmap={}\n",
    "        tidtotal={}\n",
    "        for t in rtids:\n",
    "            tidmap[int(t)]=0\n",
    "            tidtotal[int(t)]=0\n",
    "        trace_events=[]\n",
    "        mintime=rtidcol[0][\"_1\"]\n",
    "        maxtime=0\n",
    "        # pass 1: accumulate per-thread on-cpu time (_3 = thread being switched\n",
    "        # out, so r._1 - tidmap[_3] is the interval it just spent on cpu)\n",
    "        for r in rtidcol:\n",
    "            if r[\"_3\"] in tidtotal:\n",
    "                tidtotal[r[\"_3\"]]=tidtotal[r[\"_3\"]]+r[\"_1\"]-tidmap[r[\"_3\"]]\n",
    "                tidmap[r[\"_3\"]]=r[\"_1\"]\n",
    "                maxtime=r[\"_1\"]\n",
    "            if r[\"_5\"] in tidmap:\n",
    "                tidmap[r[\"_5\"]]=r[\"_1\"]\n",
    "        # pass 2: emit one 'X' slice per on-cpu interval for threads whose share\n",
    "        # of the captured window exceeds cputhreshold; slice name is the\n",
    "        # prev_state field (_4) of the switch-out record\n",
    "        for r in rtidcol:\n",
    "            if r[\"_3\"] in tidmap and tidtotal[r[\"_3\"]]/(maxtime-mintime)>cputhreshold:\n",
    "                trace_events.append({\n",
    "                    'tid':r[\"_3\"],\n",
    "                     'ts':tidmap[r[\"_3\"]],\n",
    "                     'pid':id,\n",
    "                     'ph':'X',\n",
    "                     'dur':round(r[\"_1\"]-tidmap[r[\"_3\"]],3),\n",
    "                     'name':r[\"_4\"]\n",
    "                })\n",
    "\n",
    "                tidmap[r[\"_3\"]]=r[\"_1\"]\n",
    "            if r[\"_5\"] in tidmap:\n",
    "                tidmap[r[\"_5\"]]=r[\"_1\"]\n",
    "        return [json.dumps(l) for l in trace_events]\n",
    "\n",
    "    def generate_nic_view_list(self,id=0,**kwargs):\n",
    "        # Emit 'X' trace-viewer events for recvfrom/sendto/poll records. Rows of\n",
    "        # type 'continued' carry the duration of a syscall whose completion was\n",
    "        # logged on a separate line; they are joined back (on time _1 and pid _3)\n",
    "        # onto the originating row and then dropped.\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        starttime=starttime+kwargs.get(\"sched_time_offset\",0)\n",
    "        print(\"offset time\",starttime)\n",
    "        \n",
    "        nicdf=sardf.where(\"_2<>'sched_switch'\")\n",
    "        cntdf=nicdf.where(\"_2='continued'\")\n",
    "        cntdf=cntdf.select(\"_1\",\"_3\",\"_4\").withColumnRenamed(\"_4\",\"cnt_4\")\n",
    "        nicdf=nicdf.join(cntdf,on=[\"_1\",\"_3\"],how=\"leftouter\")\n",
    "        nicdf=nicdf.where(\"_2<>'continued'\")\n",
    "        # prefer the duration from the continuation row when one was joined\n",
    "        nicdf=nicdf.select(F.col(\"_1\"),F.col(\"_2\"),F.col(\"_3\"),F.when(F.col(\"cnt_4\").isNull(), F.col(\"_4\")).otherwise(F.col(\"cnt_4\")).alias(\"_4\"),F.col(\"_5\"))\n",
    "        nicdf=nicdf.withColumn(\"_1\",F.col(\"_1\").astype(DoubleType())-starttime)\n",
    "        nicdf=nicdf.withColumn(\"_3\",F.col(\"_3\").astype(IntegerType()))\n",
    "        nicdf=nicdf.withColumn(\"_5\",F.col(\"_5\").astype(IntegerType()))\n",
    "        nicdf=nicdf.withColumn(\"_1\",F.col(\"_1\").astype(IntegerType()))\n",
    "        nicdf=nicdf.withColumn(\"_4\",F.col(\"_4\").astype(DoubleType()))\n",
    "        nicdf=nicdf.withColumn(\"_4\",F.col(\"_4\").astype(LongType()))\n",
    "        return nicdf.select(\n",
    "                F.col(\"_3\").alias('tid'),\n",
    "                (F.col(\"_1\")).alias('ts'),\n",
    "                F.lit(0).alias('pid'),\n",
    "                F.lit('X').alias('ph'),\n",
    "                F.col(\"_4\").alias('dur'),\n",
    "                F.col(\"_2\").alias('name'),\n",
    "                F.struct(\n",
    "                    F.col(\"_5\").alias(\"size\")\n",
    "                ).alias('args')\n",
    "            ).toJSON().collect()\n",
    "    \n",
    "    def generate_trace_view_list(self,id=0,**kwargs):\n",
    "        # Combine the base Analysis events with the sched and nic views built above.\n",
    "        trace_events=Analysis.generate_trace_view_list(self,id,**kwargs)\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        \n",
    "        events=self.generate_sched_view_list(id,**kwargs)\n",
    "        events.extend(self.generate_nic_view_list(id,**kwargs))\n",
    "        events.extend(trace_events)\n",
    "        \n",
    "#        events.extend(nicdf.where(\"_5>1000 and _2='sendto'\").select(\n",
    "#                 F.lit(0).alias('tid'),\n",
    "#                F.col(\"_1\").alias('ts'),\n",
    "#                F.lit(0).alias('pid'),\n",
    "#                F.lit('i').alias('ph'),\n",
    "#                F.col(\"_2\").alias('name'),\n",
    "#                F.lit(\"g\").alias(\"s\")\n",
    "#            ).toJSON().collect())\n",
    "\n",
    "\n",
    "        return events\n",
    "                      "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Sar analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Partition mapper for sc.textFile over sar text output: split each line on\n",
    "# whitespace and right-pad with empty strings to a fixed width of 118 fields so\n",
    "# that every row has the same arity when converted with toDF().\n",
    "# NOTE(review): a `splits` with the same name appears to be used by an earlier\n",
    "# cell as well -- this definition shadows it when this cell runs; confirm both\n",
    "# versions are identical.\n",
    "def splits(x):\n",
    "    fi=[]\n",
    "    for l in x:\n",
    "        li=re.split(r'\\s+',l)\n",
    "        # pad to 118 fields; toDF() needs a uniform row length\n",
    "        for j in range(len(li),118):\n",
    "            li.append('')\n",
    "        fi.append(li)\n",
    "    return iter(fi)\n",
    "\n",
    "class Sar_analysis(Analysis):\n",
    "    # Base class for sar text-report analyses: load_data() parses a sar report\n",
    "    # into a Spark DataFrame whose columns are named after the sar header line,\n",
    "    # and col_df() turns selected columns into trace-viewer 'C' (counter) events.\n",
    "    def __init__(self,sar_file):\n",
    "        Analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def load_data(self):\n",
    "        sardata=sc.textFile(self.file)\n",
    "        sardf=sardata.mapPartitions(splits).toDF()\n",
    "        sardf=sardf.where(\"_1<>'Average:'\")\n",
    "        \n",
    "        colstart=1;\n",
    "        # detect 12h (separate AM/PM column) vs 24h sar output; in the 24h case\n",
    "        # shift every column one position right and insert an empty _2 so data\n",
    "        # columns start at _3 in both layouts\n",
    "        ampm=sardf.where(\"_2='AM' or _2='PM'\").count()\n",
    "        if ampm==0:\n",
    "            for i in range(len(sardf.columns),1,-1):\n",
    "                sardf=sardf.withColumnRenamed(f'_{i}',f'_{i+1}')\n",
    "            self.timeformat='yyyy-MM-dd HH:mm:ss '\n",
    "            sardf=sardf.withColumn('_2',F.lit(''))\n",
    "            #print('no PM/AM')\n",
    "            colstart=1\n",
    "        else:\n",
    "            self.timeformat='yyyy-MM-dd hh:mm:ss a'\n",
    "            colstart=2\n",
    "            #print('with PM/AM')\n",
    "        \n",
    "        # read the file header directly: skip the 'Linux ...' banner plus blank\n",
    "        # lines to reach the column-header line\n",
    "        f=fs.open(self.file)\n",
    "        t=f.readline()\n",
    "        t=f.readline()\n",
    "        while len(t)==1:\n",
    "            t=f.readline()\n",
    "        cols=t.decode('ascii')\n",
    "        li=re.split(r'\\s+',cols)\n",
    "        ci=3;\n",
    "        # rename _3.._N to the sar header names\n",
    "        for c in li[colstart:]:\n",
    "            sardf=sardf.withColumnRenamed(f\"_{ci}\",c)\n",
    "            ci=ci+1\n",
    "            \n",
    "        # drop repeated in-band header rows (a cell equal to its own column\n",
    "        # name) and the 'Linux ...' banner row\n",
    "        sardf=sardf.where(F.col(li[-2])!=li[-2]).where(F.col(\"_1\")!=F.lit(\"Linux\"))        \n",
    "        \n",
    "        sardf.cache()\n",
    "        self.df=sardf\n",
    "        \n",
    "        # sysstat version string captured in sarv.txt (output of `sar -V`);\n",
    "        # subclasses use it to pick version-specific column semantics. Stays \"\"\n",
    "        # when the file is absent.\n",
    "        self.sarversion=\"\"\n",
    "        paths=os.path.split(self.file)\n",
    "        if fs.exists(paths[0]+\"/sarv.txt\"):\n",
    "            with fs.open(paths[0]+\"/sarv.txt\") as f:\n",
    "                allcnt = f.read().decode('ascii')\n",
    "                #print(allcnt)\n",
    "                self.sarversion=allcnt.split(\"\\n\")[0].split(\" \")[2]\n",
    "        \n",
    "        return sardf\n",
    "\n",
    "    def col_df(self,cond,colname,args,slaver_id=0, thread_id=0):\n",
    "        # Build trace-viewer 'C' (counter) events named `colname` from rows\n",
    "        # matching `cond`, aggregated per timestamp; `args` is a callable mapping\n",
    "        # the aggregated frame to the counter's value struct. slaver_id/thread_id\n",
    "        # become the event pid/tid.\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        cpudf=sardf.where(cond)\n",
    "        #cpudf.select(F.date_format(F.from_unixtime(F.lit(starttime/1000)), 'yyyy-MM-dd HH:mm:ss').alias('starttime'),'_1').show(1)\n",
    "\n",
    "        # sar rows only carry time-of-day; prepend the run's date (derived from\n",
    "        # starttime) so unix_timestamp() can parse a full datetime\n",
    "        cpudf=cpudf.withColumn('time',F.unix_timestamp(F.concat_ws(' ',F.date_format(F.from_unixtime(F.lit(starttime/1000)), 'yyyy-MM-dd'),F.col('_1'),F.col('_2')),self.timeformat))\n",
    "\n",
    "        cols=cpudf.columns\n",
    "                \n",
    "        # sum values per timestamp; the first two data columns are still strings\n",
    "        # and are only summed when they look numeric\n",
    "        cpudf=cpudf.groupBy('time').agg(\n",
    "            F.sum(F.when(F.col(cols[1]).rlike('^\\d+(\\.\\d+)*$'),F.col(cols[1]).astype(FloatType())).otherwise(0)).alias(cols[1]),\n",
    "            F.sum(F.when(F.col(cols[2]).rlike('^\\d+(\\.\\d+)*$'),F.col(cols[2]).astype(FloatType())).otherwise(0)).alias(cols[2]),\n",
    "            *[F.sum(F.col(c)).alias(c) for c in cols[3:] if not c.startswith(\"_\") and c!=\"\" and c!=\"time\"]\n",
    "        )\n",
    "        \n",
    "        traces=cpudf.orderBy(F.col(\"time\")).select(\n",
    "                F.lit(thread_id).alias('tid'),\n",
    "                (F.expr(\"time*1000\")-F.lit(self.starttime)).astype(IntegerType()).alias('ts'),\n",
    "                F.lit(slaver_id).alias('pid'),\n",
    "                F.lit('C').alias('ph'),\n",
    "                F.lit(colname).alias('name'),\n",
    "                args(cpudf).alias('args')\n",
    "            ).toJSON().collect()\n",
    "        return traces\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        trace_events=Analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        return trace_events\n",
    "\n",
    "    def get_stat(self,**kwargs):\n",
    "        # lazy-load guard shared by the subclasses' get_stat implementations\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "            \n",
    "class Sar_cpu_analysis(Sar_analysis):\n",
    "    # CPU utilization (sar -u): emits an \"all cpu%\" counter track and a summary\n",
    "    # table of the fraction of samples above user/system/iowait thresholds.\n",
    "    def __init__(self,sar_file):\n",
    "        Sar_analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        trace_events=Sar_analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        \n",
    "        # clamp occasional bogus >100% iowait samples\n",
    "        self.df=self.df.withColumn(\"%iowait\",F.when(F.col(\"%iowait\")>100,F.lit(100)).otherwise(F.col(\"%iowait\")))\n",
    "        \n",
    "        trace_events.extend(self.col_df(\"CPU='all'\",             \"all cpu%\",    lambda l: F.struct(\n",
    "                                                                                                    F.floor(F.col('%user').astype(FloatType())).alias('user'),\n",
    "                                                                                                    F.floor(F.col('%system').astype(FloatType())).alias('system'),\n",
    "                                                                                                    F.floor(F.col('%iowait').astype(FloatType())).alias('iowait')\n",
    "                                                                                                    ),                            id, 0))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":0,\"args\":{\"sort_index \":0}}))\n",
    "        \n",
    "        return trace_events    \n",
    "    def get_stat(sar_cpu,**kwargs):\n",
    "        # Summary stats: fraction of samples above the labelled thresholds plus\n",
    "        # overall averages, returned as a one-column DataFrame labelled by the\n",
    "        # run directory name.\n",
    "        Sar_analysis.get_stat(sar_cpu)\n",
    "        \n",
    "        cpuutil=sar_cpu.df.where(\"CPU='all'\").groupBy(\"_1\").agg(*[F.mean(F.col(l).astype(FloatType())).alias(l) for l in [\"%user\",\"%system\",\"%iowait\"]]).orderBy(\"_1\")\n",
    "        cnt=cpuutil.count()\n",
    "        # sar percentages are on a 0-100 scale; the previous thresholds (0.9 and\n",
    "        # 0.1) did not match the '>90%'/'>10%' labels reported below\n",
    "        user_morethan_90=cpuutil.where(\"`%user`>90\").count()\n",
    "        kernel_morethan_10=cpuutil.where(\"`%system`>10\").count()\n",
    "        iowait_morethan_10=cpuutil.where(\"`%iowait`>10\").count()\n",
    "        out=[['%user>90%',user_morethan_90/cnt],['%kernel>10%',kernel_morethan_10/cnt],[\"%iowait>10%\",iowait_morethan_10/cnt]]\n",
    "        avgutil=cpuutil.agg(*[F.mean(l).alias(l) for l in [\"%user\",\"%system\",\"%iowait\"]]).collect()\n",
    "        out.extend([[\"avg \" + l,avgutil[0][l]] for l in [\"%user\",\"%system\",\"%iowait\"]])\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        pdout.columns=[sar_cpu.file.split(\"/\")[-2]]\n",
    "        return pdout\n",
    "    \n",
    "class Sar_mem_analysis(Sar_analysis):\n",
    "    # Memory utilization (sar -r): derives \"kbrealused\" (application-used\n",
    "    # memory) in a sysstat-version-aware way and emits mem% and pagecache%\n",
    "    # counter tracks (tid 1 and 2).\n",
    "    def __init__(self,sar_file):\n",
    "        Sar_analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def load_data(self):\n",
    "        Sar_analysis.load_data(self)\n",
    "        # self.sarversion is \"\" when sarv.txt was absent: fall back to the\n",
    "        # legacy formula instead of crashing on int('')\n",
    "        sarv=[int(l) for l in self.sarversion.split(\".\")] if self.sarversion else [0,0]\n",
    "        # sysstat >= 12.2 no longer counts buffers/cache in kbmemused, so it can\n",
    "        # be used directly; the previous test `sarv[0]>=12 and sarv[1]>=2`\n",
    "        # misclassified versions like 13.0 as legacy\n",
    "        if sarv[0]>12 or (sarv[0]==12 and sarv[1]>=2):\n",
    "            self.df=self.df.withColumn(\"kbrealused\",F.col(\"kbmemused\"))\n",
    "        else:\n",
    "            # sar 10.1.5, sar 11.6.1\n",
    "            self.df=self.df.withColumn(\"kbrealused\",F.col(\"kbmemused\")-F.col(\"kbcached\")-F.col(\"kbbuffers\"))\n",
    "    \n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        trace_events=Sar_analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        \n",
    "        \n",
    "        trace_events.extend(self.col_df(F.col('kbmemfree').rlike('^\\d+$'),\"mem % \",      lambda l: F.struct(F.floor(l['kbcached']*l['%memused']/l['kbmemused']).alias('cached'),  # kbcached / (kbmemfree+kbmemused)\n",
    "                                                                                                       F.floor(l['kbbuffers']*l['%memused']/l['kbmemused']).alias('buffered'),# kbbuffers / (kbmemfree+kbmemused)\n",
    "                                                                                                       F.floor(l['kbrealused']*l['%memused']/l['kbmemused']).alias('used')), # (%memused- kbcached-kbbuffers )/  (kbmemfree+kbmemused)\n",
    "                                          id,1))\n",
    "        #trace_events.extend(self.col_df(self.df._3.rlike('^\\d+$'),\"mem cmt % \",  lambda l: F.struct(F.floor(l._8*F.lit(100)/(l._3+l._4)).alias('commit/phy'),\n",
    "        #                                                                                                   F.floor(l._10-l._8*F.lit(100)/(l._3+l._4)).alias('commit/all')),                                                             id))\n",
    "        trace_events.extend(self.col_df(F.col('kbmemfree').rlike('^\\d+$'),\"pagecache % \",      lambda l: F.struct(F.floor((l['kbcached']-l['kbdirty'])*l['%memused']/l['kbmemused']).alias('clean'), \n",
    "                                                                                                       F.floor(l['kbdirty']*l['%memused']/l['kbmemused']).alias('dirty')),\n",
    "                                          id,2))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":1,\"args\":{\"sort_index \":1}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":2,\"args\":{\"sort_index \":2}}))\n",
    "        return trace_events    \n",
    "    def get_stat(sar_mem,**kwargs):\n",
    "        # mean/75%/max of cached/used/dirty memory (as % of total), returned as a\n",
    "        # one-column DataFrame labelled by the run directory name.\n",
    "        Sar_analysis.get_stat(sar_mem)\n",
    "        \n",
    "        memutil=sar_mem.df.where(F.col('kbmemfree').rlike('^\\d+$')).select(F.floor(F.col('kbcached').astype(FloatType())*F.lit(100)*F.col('%memused')/F.col('kbmemused')).alias('cached'),  \n",
    "                                                                                   F.floor(F.col('kbbuffers').astype(FloatType())*F.lit(100)*F.col('%memused')/F.col('kbmemused')).alias('buffered'),\n",
    "                                                                                   F.floor(F.col('kbrealused').astype(FloatType())*F.lit(100)*F.col('%memused')/F.col('kbmemused')).alias('used'),\n",
    "                                                                                   F.floor(F.col('kbdirty').astype(FloatType())*F.lit(100)*F.col('%memused')/F.col('kbmemused')).alias('dirty'))\n",
    "        memsum=memutil.summary().toPandas()\n",
    "        memsum=memsum.set_index(\"summary\")\n",
    "        out=[\n",
    "            [[l + ' mean',float(memsum[l][\"mean\"])],\n",
    "            [l + ' 75%',float(memsum[l][\"75%\"])],\n",
    "            [l + ' max',float(memsum[l][\"max\"])]] for l in [\"cached\",\"used\",\"dirty\"]]\n",
    "        # previously only out[0] and out[1] were unpacked, silently dropping the\n",
    "        # 'dirty' group computed by the comprehension above\n",
    "        out=[*out[0],*out[1],*out[2]]\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        pdout.columns=[sar_mem.file.split(\"/\")[-2]]\n",
    "        return pdout\n",
    "    \n",
    "class Sar_PageCache_analysis(Sar_analysis):\n",
    "    # Paging statistics (sar -B): page in/out rate, fault rate, page frees,\n",
    "    # page scans and vm efficiency, emitted as counter tracks tid 11..15.\n",
    "    def __init__(self,sar_file):\n",
    "        Sar_analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def load_data(self):\n",
    "        Sar_analysis.load_data(self)\n",
    "    \n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        trace_events=Sar_analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        \n",
    "        \n",
    "        # pgpgin/s, pgpgout/s are kB/s in sar: /1024 -> MB/s. pgfree/s,\n",
    "        # pgscank/s, pgscand/s are pages/s: *4/1024 -> MB/s (assumes 4kB pages\n",
    "        # -- TODO confirm for non-4k-page hosts)\n",
    "        trace_events.extend(self.col_df(F.col('pgpgin/s').rlike('^\\d'),\"page inout\",      lambda l: F.struct(\n",
    "                                                                                                       F.floor(l['pgpgin/s']/1024).alias('in'),\n",
    "                                                                                                       F.floor(l['pgpgout/s']/1024).alias('out')),\n",
    "                                          id,11))\n",
    "        trace_events.extend(self.col_df(F.col('pgpgin/s').rlike('^\\d'),\"faults\",      lambda l: F.struct(F.floor((l['majflt/s'])).alias('major'), \n",
    "                                                                                                       F.floor(l['fault/s']-l['majflt/s']).alias('minor')),\n",
    "                                          id,12))\n",
    "        trace_events.extend(self.col_df(F.col('pgpgin/s').rlike('^\\d'),\"page free\",      lambda l: F.struct(F.floor((l['pgfree/s']*4/1024)).alias('free')),\n",
    "                                          id,13))\n",
    "        trace_events.extend(self.col_df(F.col('pgpgin/s').rlike('^\\d'),\"scan\",      lambda l: F.struct(F.floor((l['pgscank/s'])*4/1024).alias('kernel'), \n",
    "                                                                                                       F.floor(l['pgscand/s']*4/1024).alias('app')),\n",
    "                                          id,14))\n",
    "        trace_events.extend(self.col_df(F.col('pgpgin/s').rlike('^\\d'),\"vmeff\",      lambda l: F.struct(F.floor((l['%vmeff'])).alias('steal')),\n",
    "                                          id,15))\n",
    "        \n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":11,\"args\":{\"sort_index \":11}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":12,\"args\":{\"sort_index \":12}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":13,\"args\":{\"sort_index \":13}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":14,\"args\":{\"sort_index \":14}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":15,\"args\":{\"sort_index \":15}}))\n",
    "        # NOTE(review): a sort_index for tid 16 is emitted but no tid-16 counter\n",
    "        # track is created above -- looks like a leftover; confirm before removing\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":16,\"args\":{\"sort_index \":16}}))\n",
    "        return trace_events    \n",
    "    def get_stat(sar_mem,**kwargs):\n",
    "        # mean/75%/max of page-in/page-out (MB/s) and minor-fault rate, as a\n",
    "        # one-column DataFrame labelled by the run directory name.\n",
    "        Sar_analysis.get_stat(sar_mem)\n",
    "        \n",
    "        memutil=sar_mem.df.where(F.col('pgpgin/s').rlike('^\\d')).select(F.floor(F.col('pgpgin/s').astype(FloatType())/1024).alias('pgin'),  \n",
    "                                                                                   F.floor(F.col('pgpgout/s').astype(FloatType())/1024).alias('pgout'),\n",
    "                                                                                   F.floor(F.col('fault/s').astype(FloatType())-F.col('majflt/s').astype(FloatType())).alias('fault')\n",
    "                                                                                   )\n",
    "        memsum=memutil.summary().toPandas()\n",
    "        memsum=memsum.set_index(\"summary\")\n",
    "        out=[\n",
    "            [[l + ' mean',float(memsum[l][\"mean\"])],\n",
    "            [l + ' 75%',float(memsum[l][\"75%\"])],\n",
    "            [l + ' max',float(memsum[l][\"max\"])]] for l in [\"pgin\",\"pgout\",\"fault\"]]\n",
    "        out=[*out[0],*out[1],*out[2]]\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        pdout.columns=[sar_mem.file.split(\"/\")[-2]]\n",
    "        return pdout\n",
    "    \n",
    "    \n",
    "class Sar_disk_analysis(Sar_analysis):\n",
    "    def __init__(self,sar_file):\n",
    "        Sar_analysis.__init__(self,sar_file)\n",
    "        \n",
    "    def load_data(self):\n",
    "        # Load sar block-device data, keep only devices that were actually used,\n",
    "        # and normalize column names/units across sysstat versions.\n",
    "        Sar_analysis.load_data(self)\n",
    "        \n",
    "        self.df=self.df.withColumn(\"%util\",F.col(\"%util\").astype(IntegerType()))\n",
    "        # a device counts as used when its peak utilization exceeded 10%\n",
    "        used_disk=self.df.groupBy(\"DEV\").agg(F.max(F.col(\"%util\")).alias(\"max_util\"),F.mean(\"%util\").alias(\"avg_util\")).where(F.col(\"max_util\")>10).collect()\n",
    "        self.df=self.df.where(F.col(\"DEV\").isin([l['DEV'] for l in used_disk]))\n",
    "        #print(\"used disks with its max util% and avg util% are: \")\n",
    "        #display([(l['DEV'],l[\"max_util\"],l[\"avg_util\"]) for l in used_disk])\n",
    "        \n",
    "        # older sysstat reports rd_sec/s / wr_sec/s in 512-byte sectors: convert\n",
    "        # to kB/s under the rkB/s / wkB/s names used below\n",
    "        if \"rd_sec/s\" in self.df.columns:\n",
    "            self.df=self.df.withColumn(\"rkB/s\",F.expr('cast(`rd_sec/s` as float)*512/1024'))\n",
    "        if \"wr_sec/s\" in self.df.columns:\n",
    "            self.df=self.df.withColumn(\"wkB/s\",F.expr('cast(`wr_sec/s` as float)*512/1024'))\n",
    "        \n",
    "        # newer sysstat renamed avgrq-sz/avgqu-sz to areq-sz/aqu-sz; map back to\n",
    "        # the legacy names the rest of the class expects\n",
    "        if \"areq-sz\" in self.df.columns:\n",
    "            self.df=self.df.withColumnRenamed(\"areq-sz\",\"avgrq-sz\")\n",
    "        if \"aqu-sz\" in self.df.columns:\n",
    "            self.df=self.df.withColumnRenamed(\"aqu-sz\",\"avgqu-sz\")\n",
    "            \n",
    "        # scale kB/s down (by 1024) for display; the columns keep their kB names\n",
    "        if \"rkB/s\" in self.df.columns:\n",
    "            self.df=self.df.withColumn(\"rkB/s\",F.expr('cast(`rkB/s` as float)/1024'))\n",
    "        if \"wkB/s\" in self.df.columns:\n",
    "            self.df=self.df.withColumn(\"wkB/s\",F.expr('cast(`wkB/s` as float)/1024'))\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        trace_events=Sar_analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "\n",
    "        disk_prefix=kwargs.get('disk_prefix',\"\")\n",
    "        \n",
    "        if type(disk_prefix)==str:\n",
    "            diskfilter = \"DEV like '\"+disk_prefix+\"%'\"\n",
    "        elif type(disk_prefix)==list:\n",
    "            diskfilter = \"DEV in (\"+\",\".join(disk_prefix)+\")\"\n",
    "        else:\n",
    "            diskfilter = \"DEV like '%'\"\n",
    "\n",
    "        print(diskfilter)\n",
    "        devcnt=self.df.where(diskfilter).select(\"DEV\").distinct().count()\n",
    "        \n",
    "        trace_events.extend(self.col_df(diskfilter,      \"disk b/w\",       lambda l: F.struct(\n",
    "                                                                                                            F.floor(F.col(\"rKB/s\")).alias('read'),\n",
    "                                                                                                            F.floor(F.col(\"wKB/s\")).alias('write')),id, 3))\n",
    "        trace_events.extend(self.col_df(diskfilter,      \"disk%\",       lambda l: F.struct(\n",
    "                                                                                                            (F.col(\"%util\")/F.lit(devcnt)).alias('%util')),id, 4))\n",
    "        trace_events.extend(self.col_df(diskfilter,      \"req size\",       lambda l: F.struct(\n",
    "                                                                                                            (F.col(\"avgrq-sz\")/F.lit(devcnt)).alias('avgrq-sz')),id, 5))\n",
    "        trace_events.extend(self.col_df(diskfilter,      \"queue size\",       lambda l: F.struct(\n",
    "                                                                                                            (F.col(\"avgqu-sz\")/F.lit(512*devcnt/1024)).alias('avgqu-sz')),id, 6))\n",
    "        trace_events.extend(self.col_df(diskfilter,      \"await\",       lambda l: F.struct(\n",
    "                                                                                                            (F.col(\"await\")/F.lit(devcnt)).alias('await')),id,7))\n",
    "        \n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":3,\"args\":{\"sort_index \":3}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":4,\"args\":{\"sort_index \":4}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":5,\"args\":{\"sort_index \":5}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":6,\"args\":{\"sort_index \":6}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":7,\"args\":{\"sort_index \":7}}))\n",
    "        return trace_events    \n",
    "\n",
    "    def get_stat(sar_disk,**kwargs):\n",
    "        Sar_analysis.get_stat(sar_disk)\n",
    "        disk_prefix=kwargs.get('disk_prefix',\"\")\n",
    "        \n",
    "        if type(disk_prefix)==str:\n",
    "            diskfilter = \"DEV like '\"+disk_prefix+\"%'\"\n",
    "        elif type(disk_prefix)==list:\n",
    "            diskfilter = \"DEV in (\"+\",\".join(disk_prefix)+\")\"\n",
    "        else:\n",
    "            diskfilter = \"DEV like '%'\"\n",
    "\n",
    "        diskutil=sar_disk.df.where(diskfilter).groupBy(\"_1\").agg(F.mean(F.col(\"%util\").astype(FloatType())).alias(\"%util\")).orderBy(\"_1\")\n",
    "        totalcnt=diskutil.count()\n",
    "        time_morethan_90=diskutil.where(F.col(\"%util\")>90).count()/totalcnt\n",
    "        avgutil=diskutil.agg(F.mean(\"%util\")).collect()\n",
    "        out=[[\"avg disk util\",avgutil[0][\"avg(%util)\"]],\n",
    "            [\"time more than 90%\", time_morethan_90]]\n",
    "        diskbw=sar_disk.df.where(diskfilter).groupBy(\"_1\").agg(F.sum(F.col(\"rKB/s\")).alias(\"rd_bw\"),F.sum(F.col(\"wKB/s\")).alias(\"wr_bw\"))\n",
    "        bw=diskbw.agg(F.sum(\"rd_bw\").alias(\"total read\"),F.sum(\"wr_bw\").alias(\"total write\"),F.mean(\"rd_bw\").alias(\"read bw\"),F.mean(\"wr_bw\").alias(\"write bw\"),F.max(\"rd_bw\").alias(\"max read\"),F.max(\"wr_bw\").alias(\"max write\")).collect()\n",
    "        maxread=bw[0][\"max read\"]\n",
    "        maxwrite=bw[0][\"max write\"]\n",
    "        rdstat, wrstat = diskbw.stat.approxQuantile(['rd_bw','wr_bw'],[0.75,0.95,0.99],0.0)\n",
    "        time_rd_morethan_95 = diskbw.where(F.col(\"rd_bw\")>rdstat[1]).count()/totalcnt\n",
    "        time_wr_morethan_95 = diskbw.where(F.col(\"wr_bw\")>rdstat[1]).count()/totalcnt\n",
    "        out.append(['total read (G)' , bw[0][\"total read\"]/1024])\n",
    "        out.append(['total write (G)', bw[0][\"total write\"]/1024])\n",
    "        out.append(['avg read bw (MB/s)', bw[0][\"read bw\"]])\n",
    "        out.append(['avg write bw (MB/s)', bw[0][\"write bw\"]])\n",
    "        out.append(['read bw %75', rdstat[0]])\n",
    "        out.append(['read bw %95', rdstat[1]])\n",
    "        out.append(['read bw max', rdstat[2]])\n",
    "        out.append(['time_rd_morethan_95', time_rd_morethan_95])\n",
    "        out.append(['write bw %75', wrstat[0]])\n",
    "        out.append(['write bw %95', wrstat[1]])\n",
    "        out.append(['write bw max', wrstat[2]])\n",
    "        out.append(['time_wr_morethan_95', time_wr_morethan_95])\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        pdout.columns=[sar_disk.file.split(\"/\")[-2]]\n",
    "        return pdout\n",
    "    \n",
    "class Sar_nic_analysis(Sar_analysis):\n",
    "    \"\"\"sar network-interface report analysis: rx/tx bandwidth trace counters\n",
    "    and an rx-bandwidth percentile summary.\"\"\"\n",
    "    def __init__(self,sar_file):\n",
    "        Sar_analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        # Counter events in MB/s for ethernet (tid 8), infiniband (tid 9)\n",
    "        # and loopback (tid 10), plus sort-index metadata for track order.\n",
    "        trace_events=Sar_analysis.generate_trace_view_list(self,id, **kwargs)\n",
    "        \n",
    "        # Either an explicit list of quoted NIC names, or everything but lo.\n",
    "        nicfilter=\"\"\n",
    "        if 'nic_prefix' in kwargs.keys():\n",
    "            nicfilter= \"IFACE in (\" + \",\".join(kwargs.get('nic_prefix',[\"'eth3'\",\"'enp24s0f1'\"])) + \")\"\n",
    "        else:\n",
    "            nicfilter= \"IFACE != 'lo'\"\n",
    "        \n",
    "        # NOTE(review): the ib/lo rows below filter on the raw column `_3`\n",
    "        # while the eth row uses the renamed IFACE column — confirm `_3`\n",
    "        # still exists in this DataFrame after header renaming.\n",
    "        trace_events.extend(self.col_df(nicfilter,       \"eth \",        lambda l: F.struct(F.floor(F.expr('cast(`rxkB/s` as float)/1024')).alias('rxmb/s'),F.floor(F.expr('cast(`txkB/s` as float)/1024')).alias('txmb/s')),                id, 8))\n",
    "        trace_events.extend(self.col_df(\"_3 like 'ib%'\",        \"ib \",        lambda l: F.struct(F.floor(F.expr('cast(`rxkB/s` as float)/1024')).alias('rxmb/s'),F.floor(F.expr('cast(`txkB/s` as float)/1024')).alias('txmb/s')),                id, 9))\n",
    "        trace_events.extend(self.col_df(\"_3 = 'lo'\",            \"lo \",         lambda l: F.struct(F.floor(F.expr('cast (`rxkB/s` as float)/1024')).alias('rxmb/s'),F.floor(F.expr('cast (`txkB/s` as float)/1024')).alias('txmb/s')),              id, 10))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":8,\"args\":{\"sort_index \":8}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":9,\"args\":{\"sort_index \":9}}))\n",
    "        trace_events.append(json.dumps({\"name\": \"thread_sort_index\",\"ph\": \"M\",\"pid\":id,\"tid\":10,\"args\":{\"sort_index \":10}}))\n",
    "        return trace_events  \n",
    "    \n",
    "    def get_stat(sar_nic,**kwargs):\n",
    "        # 75/95/99th percentiles of total rx MB/s over the selected NICs,\n",
    "        # summed per timestamp (_1). `sar_nic` acts as self; returns a\n",
    "        # one-column pandas DataFrame labeled with the host directory.\n",
    "        Sar_analysis.get_stat(sar_nic)\n",
    "        nicfilter=\"\"\n",
    "        \n",
    "        if 'nic_prefix' in kwargs.keys():\n",
    "            nicfilter= \"IFACE in (\" + \",\".join(kwargs.get('nic_prefix',[\"'eth3'\",\"'enp24s0f1'\"])) + \")\"\n",
    "        else:\n",
    "            nicfilter= \"IFACE != 'lo'\"\n",
    "            \n",
    "        nicbw=sar_nic.df.where(nicfilter).groupBy(\"_1\").agg(F.sum(F.col(\"rxkB/s\").astype(FloatType())/1024).alias(\"rx MB/s\")).orderBy(\"_1\")\n",
    "        # Empty selection (e.g. no matching NIC): report zeros.\n",
    "        if nicbw.count()==0:\n",
    "            out=[[\"rx MB/s 75%\",0],[\"rx MB/s 95%\",0],[\"rx MB/s 99%\",0]]\n",
    "        else:\n",
    "            out=nicbw.stat.approxQuantile(['rx MB/s'],[0.75,0.95,0.99],0.0)[0]\n",
    "            out=[[\"rx MB/s 75%\",out[0]],[\"rx MB/s 95%\",out[1]],[\"rx MB/s 99%\",out[2]]]\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        pdout.columns=[sar_nic.file.split(\"/\")[-2]]\n",
    "        return pdout"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# PID State analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Pidstat_analysis(Analysis):\n",
    "    \"\"\"Per-thread CPU usage from pidstat output, rendered as one chrome-trace\n",
    "    counter track of the hottest threads plus an aggregated 'Other' series.\"\"\"\n",
    "    def __init__(self,sar_file):\n",
    "        Analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def load_data(self):\n",
    "        # `sc` and `splits` are notebook-level globals from earlier cells.\n",
    "        sardata=sc.textFile(self.file)\n",
    "        sardf=sardata.mapPartitions(splits).toDF()\n",
    "        sardf=sardf.where(\"_1<>'Average:'\")\n",
    "        \n",
    "        # Use the first header row (the one containing 'TID') to rename the\n",
    "        # positional columns (_1, _2, ...) to their pidstat field names.\n",
    "        headers=sardf.where(\"_4='TID' or _5='TID'\").limit(1).collect()\n",
    "        r=headers[0].asDict()\n",
    "        findtime=False\n",
    "        for i,v in r.items():\n",
    "            if(v==\"Time\"):\n",
    "                findtime=True\n",
    "        # Some pidstat formats have no 'Time' header; assume column _1 is it.\n",
    "        if not findtime:\n",
    "            r[\"_1\"]=\"Time\"\n",
    "        for i,v in r.items():\n",
    "            if(v!=\"\"):\n",
    "                sardf=sardf.withColumnRenamed(i,v)\n",
    "        # Keep thread-level rows only (TGID column is '0' or '-' for them).\n",
    "        sardf=sardf.where(\"TGID='0' or TGID='-'\") \n",
    "\n",
    "        self.df=sardf\n",
    "        return sardf\n",
    "\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        # Build one 'pidstat' counter event per timestamp at tid 6, with one\n",
    "        # args entry per hot thread and a summed 'Other' entry for the rest.\n",
    "        trace_list=Analysis.generate_trace_view_list(self,id,**kwargs)\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        \n",
    "        sardf=sardf.withColumn(\"%CPU\",F.col(\"%CPU\").astype(FloatType()))\n",
    "        sardf=sardf.withColumn(\"Time\",F.col(\"Time\").astype(LongType()))\n",
    "        sardf=sardf.withColumn(\"TID\",F.col(\"TID\").astype(LongType()))\n",
    "        # Hot threads: ever above 30% CPU, and seen hot at least half as often\n",
    "        # as the most frequently hot thread.\n",
    "        hotthreads=sardf.where(\"`%CPU`>30\").groupBy(\"TID\").count().collect()\n",
    "        hts=[(r[0],r[1]) for r in hotthreads]\n",
    "        htc=[r[1] for r in hotthreads]\n",
    "        if len(htc)==0:\n",
    "            return trace_list\n",
    "        maxcnt=max(htc)\n",
    "        hts=[r[0] for r in hts if r[1]>maxcnt/2]\n",
    "        # One (Time, TID_x, CPU_x) frame per hot thread, inner-joined on Time.\n",
    "        tdfs=list(map(lambda x: sardf.withColumnRenamed(\"TID\",\"TID_\"+str(x)).withColumnRenamed(\"%CPU\",\"CPU_\"+str(x)).where(F.col(\"TID\")==x).select(\"Time\",\"TID_\"+str(x),\"CPU_\"+str(x)),hts))\n",
    "        finaldf=reduce(lambda x,y: x.join(y,on=[\"Time\"]),tdfs)\n",
    "        # Everything that is not a hot thread is summed into CPU_Other.\n",
    "        othersdf=sardf.where(\"TID not in (\"+\",\".join(map(lambda x: str(x),hts))+\")\").groupBy(\"Time\").agg(F.sum(\"%CPU\").alias(\"CPU_Other\"))\n",
    "        finaldf=finaldf.join(othersdf,on=[\"Time\"])\n",
    "        finaldf=finaldf.orderBy(\"Time\")\n",
    "        hts.append(\"Other\")\n",
    "        stt=[F.col(\"CPU_\"+str(x)).alias(str(x)) for x in hts]\n",
    "        args=F.struct(*stt)\n",
    "        \n",
    "        trace_list.extend(finaldf.select(\n",
    "                F.lit(6).alias('tid'),\n",
    "                (F.expr(\"Time*1000\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "                F.lit(id).alias('pid'),\n",
    "                F.lit('C').alias('ph'),\n",
    "                F.lit(\"pidstat\").alias('name'),\n",
    "                args.alias('args')\n",
    "            ).toJSON().collect())\n",
    "        return trace_list\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Perf stat Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Perfstat_analysis(Analysis):\n",
    "    \"\"\"Analysis of interval-mode `perf stat` output: per-event trace counters\n",
    "    and an IPC / frequency / utilization summary.\"\"\"\n",
    "    def __init__(self,sar_file):\n",
    "        Analysis.__init__(self,sar_file)\n",
    "    \n",
    "    def load_data(self):\n",
    "        # `sc`, `splits`, `fs` and `spark` are notebook-level globals defined\n",
    "        # in earlier cells.\n",
    "        sardata=sc.textFile(self.file)\n",
    "        sardf=sardata.mapPartitions(splits).toDF()\n",
    "        \n",
    "        # Wall-clock start of the perf run, recorded alongside the data file.\n",
    "        paths=os.path.split(self.file)\n",
    "        if fs.exists(paths[0]+\"/perfstarttime\"):\n",
    "            with fs.open(paths[0]+\"/perfstarttime\") as f:\n",
    "                strf=f.read().decode('ascii')\n",
    "        else:\n",
    "            print(\"error, perfstarttime not found\")\n",
    "            return\n",
    "        \n",
    "        tsc_freq_file = os.path.join(paths[0], 'tsc_freq')\n",
    "        if fs.exists(tsc_freq_file):\n",
    "            self.tsc_freq = int(spark.read.text(tsc_freq_file).collect()[0][0])\n",
    "        else:\n",
    "            print(f'{tsc_freq_file} not exists')\n",
    "            return\n",
    "        \n",
    "        totalcores_file = os.path.join(paths[0], 'totalcores')\n",
    "        if fs.exists(totalcores_file):\n",
    "            self.totalcores = int(spark.read.text(totalcores_file).collect()[0][0])\n",
    "        else:\n",
    "            print(f'{totalcores_file} not exists')\n",
    "            return\n",
    "        \n",
    "        # Header is of the form \"# started on <ctime>\".\n",
    "        # NOTE(review): this needs `datetime` to be the datetime *class*\n",
    "        # (`from datetime import datetime`); the notebook head shows only\n",
    "        # `import datetime`, which would raise AttributeError here — confirm.\n",
    "        strf=strf[len(\"# started on \"):].strip()\n",
    "        starttime=datetime.strptime(strf, \"%a %b %d %H:%M:%S %Y\").timestamp()*1000\n",
    "        sardf=sardf.where(\"_1<>'#'\")\n",
    "        # ts = absolute epoch ms; _3 = counter value, _4 = event name.\n",
    "        sardf=sardf.withColumn(\"ts\",F.col(\"_2\").astype(DoubleType())*1000+F.lit(starttime)).where(\"ts is not null\").select(\"ts\",\"_3\",\"_4\")\n",
    "        sardf=sardf.withColumn('_3', F.regexp_replace('_3', ',', '').astype(LongType()))\n",
    "        sardf=sardf.cache()\n",
    "        self.df=sardf\n",
    "        return sardf\n",
    "\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        # One counter track per event name, at tid = 100 + event index.\n",
    "        trace_list=Analysis.generate_trace_view_list(self,id,**kwargs)\n",
    "        sardf=self.df\n",
    "        starttime=self.starttime\n",
    "        \n",
    "        # StringIndexer assigns a stable numeric index to each event name.\n",
    "        stringIndexer = StringIndexer(inputCol=\"_4\", outputCol=\"syscall_idx\")\n",
    "        model = stringIndexer.fit(sardf)\n",
    "        sardf=model.transform(sardf)\n",
    "        \n",
    "#        cnts=sardf.select(\"_4\").distinct().collect()\n",
    "#        cnts=[l['_4'] for l in cnts]\n",
    "#        cntmap={ cnts[i]:i  for i in range(0, len(cnts) ) }\n",
    "#        mapexpr=F.create_map([F.lit(x) for x in chain(*cntmap.items())])\n",
    "#        sardf.select(mapexpr.getItem(F.col(\"_4\")))\n",
    "        \n",
    "        sardf=sardf.withColumn(\"syscall_idx\",F.col(\"syscall_idx\").astype(IntegerType()))\n",
    "        \n",
    "        trace_list.extend(sardf.select(\n",
    "            (F.lit(100)+F.col(\"syscall_idx\")).alias('tid'),\n",
    "            (F.col(\"ts\")-F.lit(starttime)).astype(LongType()).alias('ts'),\n",
    "            F.lit(id).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.col(\"_4\").alias('name'),\n",
    "            F.struct(F.col(\"_3\").alias(\"cnt\")).alias('args')\n",
    "        ).toJSON().collect())\n",
    "        return trace_list\n",
    "    \n",
    "    def get_stat(self, **kwargs):\n",
    "        # Compute IPC, instruction count (G), average core frequency (GHz)\n",
    "        # and CPU utilization from the perf-stat interval counters.\n",
    "        if self.df is None:\n",
    "            self.load_data()\n",
    "\n",
    "        raw_data = spark.read.text(self.file)\n",
    "\n",
    "        # Filter out non-data lines and split the data into columns\n",
    "        filtered_data = raw_data.filter(\n",
    "            ~raw_data.value.startswith('#') & raw_data.value.rlike(r\"^\\s*\\d\")\n",
    "        )\n",
    "\n",
    "        # Rows: time, counts (thousands separators stripped), event name, and\n",
    "        # an optional 5th field kept as 'ipc' when present.\n",
    "        split_data = filtered_data.rdd.map(lambda row: row[0].split()).map(\n",
    "            lambda parts: (float(parts[0]), int(parts[1].replace(\",\", \"\")), parts[2], '' if len(parts) == 3 else parts[4])\n",
    "        )\n",
    "\n",
    "        schema = [\"time\", \"counts\", \"events\", \"ipc\"]\n",
    "        df = split_data.toDF(schema)\n",
    "\n",
    "        events_df = df.filter(col('ipc') == '')\n",
    "        ipc_df = df.filter(col('ipc') != '')\n",
    "\n",
    "        # NOTE(review): col/_sum/avg/lag/when/Window are assumed to be imported\n",
    "        # by an earlier notebook cell — confirm.\n",
    "        instructions = ipc_df.select(_sum(col(\"counts\"))).collect()[0][0] / 1e9\n",
    "        avg_ipc = ipc_df.select(avg(col(\"ipc\"))).collect()[0][0]\n",
    "\n",
    "        df_ccu_ref_tsc = events_df.select(col('time'), col('counts')).filter(col('events') == 'cpu_clk_unhalted.ref_tsc').withColumnRenamed('counts', 'cpu_clk_unhalted_ref_tsc')\n",
    "        df_ccu_thread = events_df.select(col('time'), col('counts')).filter(col('events') == 'cpu_clk_unhalted.thread').withColumnRenamed('counts', 'cpu_clk_unhalted_thread')\n",
    "\n",
    "        # TSC cycles available per interval = (time - prev_time) * tsc_freq.\n",
    "        window_spec = Window.orderBy(\"time\")\n",
    "        df_ccu_ref_tsc = df_ccu_ref_tsc.withColumn(\"prev_time\", lag(\"time\").over(window_spec))\n",
    "        df_ccu_ref_tsc = df_ccu_ref_tsc.withColumn(\"prev_time\", when(col(\"prev_time\").isNull(), 0).otherwise(col(\"prev_time\")))\n",
    "        df_ccu_ref_tsc = df_ccu_ref_tsc.withColumn(\"tsc\", (col(\"time\") - col(\"prev_time\")) * self.tsc_freq)\n",
    "\n",
    "        # freq (GHz) = thread cycles / ref-tsc cycles * tsc_freq / 1e9.\n",
    "        joined_df = df_ccu_ref_tsc.join(df_ccu_thread, on=[\"time\"], how=\"inner\")\n",
    "        cpu_freq_df = joined_df.withColumn(\"freq\", joined_df.cpu_clk_unhalted_thread / joined_df.cpu_clk_unhalted_ref_tsc * self.tsc_freq / 1e9)\n",
    "        cpu_freq = cpu_freq_df.select(avg(col('freq'))).collect()[0][0]\n",
    "\n",
    "        # cpu% = unhalted ref cycles / (available TSC cycles * core count).\n",
    "        cpu_util_df = df_ccu_ref_tsc.withColumn(\"cpu%\", col(\"cpu_clk_unhalted_ref_tsc\") / col(\"tsc\") / self.totalcores * 100)\n",
    "        cpu_util = cpu_util_df.select(avg(col('cpu%'))).collect()[0][0]\n",
    "\n",
    "        out = [['ipc', avg_ipc], ['instructions', instructions], ['cpu_freq', cpu_freq], ['cpu%', cpu_util]]\n",
    "        pdout=pandas.DataFrame(out).set_index(0)\n",
    "        \n",
    "        return pdout"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# GPU analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class gpu_analysis(Analysis):\n",
    "    \"\"\"nvidia-smi CSV query log -> chrome-trace counters for GPU utilization\n",
    "    (pid `id`) and memory used (pid id+1), one tid per GPU index.\"\"\"\n",
    "    def __init__(self,gpu_file):\n",
    "        Analysis.__init__(self,gpu_file)\n",
    "        \n",
    "    def load_data(self):\n",
    "        df_pf=spark.read.format(\"com.databricks.spark.csv\").option(\"header\",\"true\").option(\"mode\", \"DROPMALFORMED\").option(\"delimiter\", \",\").load(self.file)\n",
    "        # 'yyyy/MM/dd HH:mm:ss.fff' -> epoch ms (the sub-second part after the\n",
    "        # '.' is parsed separately and added back).\n",
    "        df_pf2=df_pf.withColumn('timestamp',F.unix_timestamp(F.col('timestamp'),'yyyy/MM/dd HH:mm:ss')*1000+(F.split(F.col('timestamp'),'\\.')[1]).astype(IntegerType()))\n",
    "        df_pf2=df_pf2.withColumnRenamed(' utilization.gpu [%]','gpu_util')\n",
    "        df_pf2=df_pf2.withColumnRenamed(' utilization.memory [%]','mem_util')\n",
    "        df_pf2=df_pf2.withColumnRenamed(' memory.used [MiB]','mem_used')\n",
    "        df_pf2=df_pf2.withColumnRenamed(' index','index')\n",
    "        # Values carry a leading space/unit; take the numeric token (index 1).\n",
    "        df_pf2=df_pf2.withColumn('gpu_util', (F.split('gpu_util',' ')[1]).astype(IntegerType()))\n",
    "        df_pf2=df_pf2.withColumn('mem_util', (F.split('mem_util',' ')[1]).astype(IntegerType()))\n",
    "        df_pf2=df_pf2.withColumn('mem_used', (F.split('mem_used',' ')[1]).astype(IntegerType()))\n",
    "        # NOTE(review): caches the raw df_pf, not the transformed df_pf2 —\n",
    "        # possibly intended df_pf2.cache(); confirm.\n",
    "        df_pf.cache()\n",
    "        self.df=df_pf2\n",
    "        return df_pf2\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        Analysis.generate_trace_view_list(self,id)\n",
    "            \n",
    "        df_pf2=self.df\n",
    "        starttime=self.starttime\n",
    "        trace_events=[]\n",
    "        \n",
    "        # Per-GPU utilization counters at pid `id`.\n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['timestamp']).select(\n",
    "            F.col('index').alias('tid'),\n",
    "            (F.expr(\"timestamp\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(id).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('gpu_util_'),F.col('index')).alias('name'),\n",
    "            F.struct(F.col('gpu_util').alias('gpu')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        # Memory-used counters at pid id+1; divided by 32768 MiB — presumably\n",
    "        # the card's total memory (32 GiB) so the plot is a fraction; confirm.\n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['timestamp']).select(\n",
    "            F.col('index').alias('tid'),\n",
    "            (F.expr(\"timestamp\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(int(id)+1).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('mem_util_'),F.col('index')).alias('name'),\n",
    "            F.struct((F.col('mem_used')/F.lit(32768)).alias('mem')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        return trace_events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def splits_dmon(x):\n",
    "    \"\"\"Parse one partition of `nvidia-smi dmon`-style text output.\n",
    "\n",
    "    Keeps only data lines (those whose stripped form starts with '20') that\n",
    "    split into exactly 11 whitespace-separated fields, and returns an\n",
    "    iterator over the resulting field lists, ready for toDF().\n",
    "    \"\"\"\n",
    "    stripped = (line.strip() for line in x)\n",
    "    rows = (re.split(r'\\s+', line) for line in stripped if line.startswith('20'))\n",
    "    return iter([row for row in rows if len(row) == 11])\n",
    "\n",
    "class gpu_dmon_analysis(Analysis):\n",
    "    \"\"\"`nvidia-smi dmon` log -> chrome-trace counter tracks, four pids:\n",
    "    id (gpu util), id+1 (mem util), id+2 (gpu freq), id+3 (pcie tx/rx).\"\"\"\n",
    "    def __init__(self,gpu_file):\n",
    "        Analysis.__init__(self,gpu_file)\n",
    "        \n",
    "    def load_data(self):\n",
    "        df_pf=sc.textFile(self.file)\n",
    "        df_pf=df_pf.mapPartitions(splits_dmon).toDF()\n",
    "        \n",
    "        # _1 (date) + _2 (time) -> epoch milliseconds; columns _3.._11 are\n",
    "        # cast to integers (dmon metric columns).\n",
    "        df_pf2=df_pf.withColumn('_1',F.unix_timestamp(F.concat_ws(' ',F.col('_1'),F.col('_2')),'yyyyMMdd HH:mm:ss')*1000)\n",
    "        for c in range(3,12):\n",
    "            df_pf2=df_pf2.withColumn(f'_{c}',F.col(f'_{c}').astype(IntegerType()))\n",
    "\n",
    "        # NOTE(review): caches the raw df_pf rather than df_pf2 — possibly\n",
    "        # intended df_pf2.cache(); confirm.\n",
    "        df_pf.cache()\n",
    "        self.df=df_pf2\n",
    "        return df_pf2\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        # _3 is used as the per-GPU tid; _4 -> 'gpu', _5 -> 'mem',\n",
    "        # _9 -> 'gpu_freq', _10/_11 -> pcie 'tx'/'rx' (dmon column meanings\n",
    "        # assumed from the aliases — confirm against the dmon header).\n",
    "        Analysis.generate_trace_view_list(self,id)\n",
    "\n",
    "        df_pf2=self.df\n",
    "        starttime=self.starttime\n",
    "        trace_events=[]\n",
    "        \n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['_1']).select(\n",
    "            F.col('_3').alias('tid'),\n",
    "            (F.expr(\"_1\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(id).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('gpu_util_'),F.col('_3')).alias('name'),\n",
    "            F.struct(F.col('_4').alias('gpu')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['_1']).select(\n",
    "            F.col('_3').alias('tid'),\n",
    "            (F.expr(\"_1\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(id+1).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('mem_util_'),F.col('_3')).alias('name'),\n",
    "            F.struct(F.col('_5').alias('mem')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['_1']).select(\n",
    "            F.col('_3').alias('tid'),\n",
    "            (F.expr(\"_1\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(id+2).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('gpu_freq_'),F.col('_3')).alias('name'),\n",
    "            F.struct(F.col('_9').alias('gpu_freq')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        trace_events.extend(df_pf2.orderBy(df_pf2['_1']).select(\n",
    "            F.col('_3').alias('tid'),\n",
    "            (F.expr(\"_1\")-F.lit(starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.lit(id+3).alias('pid'),\n",
    "            F.lit('C').alias('ph'),\n",
    "            F.concat(F.lit('pcie_'),F.col('_3')).alias('name'),\n",
    "            F.struct(F.col('_10').alias('tx'),F.col('_11').alias('rx')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        return trace_events\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# DASK analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def split_dask(x):\n",
    "    \"\"\"Parse one partition of a dask task log into rows for toDF().\n",
    "\n",
    "    Each output row is [task_name, <remaining fields...>, task_id]. Lines\n",
    "    starting with '(' carry a tuple-style key ('name-hash', index); other\n",
    "    lines are plain CSV with an id suffixed to the first field.\n",
    "    (A per-line debug print was removed from the loop.)\n",
    "    \"\"\"\n",
    "    fi=[]\n",
    "    for l in x:\n",
    "        li=[]\n",
    "        if l.startswith('('):\n",
    "            lx=re.split(r'[()]',l)\n",
    "            lv=lx[1]\n",
    "            p=re.search(r\"'(.*)-([0-9a-f]+)', *(\\d+)\",lv)\n",
    "            if not p:\n",
    "                print(\"dask log first field doesn't match (.*)-[0-9a-f]+', *(\\\\d+)\")\n",
    "                # Bug fix: the original bare `return` yielded None, which makes\n",
    "                # Spark's mapPartitions fail (\"NoneType is not iterable\");\n",
    "                # return the rows parsed so far instead.\n",
    "                return iter(fi)\n",
    "            li.append(p.group(1))\n",
    "            li.extend(lx[2].split(\",\")[1:])\n",
    "            li.append(p.group(3))\n",
    "        else:\n",
    "            li=l.split(',')\n",
    "            # Try a full uuid suffix first, then a short hex suffix.\n",
    "            p=re.search(r\"(.*)-([0-9a-f]+-[0-9a-f]+-[0-9a-f]+-[0-9a-f]+-[0-9a-f]+)$\",li[0])\n",
    "            if not p:\n",
    "                p=re.search(r\"(.*)-([0-9a-f]+)$\",li[0])\n",
    "            \n",
    "            # NOTE(review): if neither pattern matches, p is None and the next\n",
    "            # line raises AttributeError — confirm the log format guarantees a\n",
    "            # match, or handle it like the tuple-key branch above.\n",
    "            li[0]=p.group(1)\n",
    "            li.append(p.group(2))\n",
    "        fi.append(li)\n",
    "    return iter(fi)\n",
    "\n",
    "class dask_analysis(Analysis):\n",
    "    \"\"\"Dask task log -> chrome-trace 'X' (complete) events, one lane per\n",
    "    worker, one event per task.\"\"\"\n",
    "    def __init__(self,dask_file):\n",
    "        Analysis.__init__(self,dask_file)\n",
    "\n",
    "    def load_data(self):\n",
    "        rdds=sc.textFile(self.file)\n",
    "        df_pf=rdds.mapPartitions(split_dask).toDF()\n",
    "        # _c0=task name, _c1=worker, _c2/_c3=timestamps (start/stop,\n",
    "        # presumably), _id=task id.\n",
    "        df_pf=df_pf.withColumnRenamed('_1','_c0')\n",
    "        df_pf=df_pf.withColumnRenamed('_2','_c1')\n",
    "        df_pf=df_pf.withColumnRenamed('_3','_c2')\n",
    "        df_pf=df_pf.withColumnRenamed('_4','_c3')\n",
    "        df_pf=df_pf.withColumnRenamed('_5','_id')\n",
    "        \n",
    "        # Worker id = third ':'-separated field (e.g. port of tcp://host:port\n",
    "        # — confirm); times scaled from seconds to milliseconds.\n",
    "        df_pf=df_pf.withColumn('_c1',F.split(F.col('_c1'),\":\")[2])\n",
    "        df_pf=df_pf.withColumn('_c3',df_pf._c3.astype(DoubleType())*1000) \n",
    "        df_pf=df_pf.withColumn('_c2',df_pf._c2.astype(DoubleType())*1000)\n",
    "        \n",
    "        df_pf.cache()\n",
    "        self.df=df_pf\n",
    "        # Trace origin = earliest _c2 timestamp in the log.\n",
    "        self.starttime=df_pf.agg(F.min(\"_c2\")).collect()[0]['min(_c2)']\n",
    "        return df_pf\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        Analysis.generate_trace_view_list(self,id)\n",
    "        \n",
    "        df_pf=self.df\n",
    "\n",
    "        # Per worker (_c1), each task starts at max(previous task's finish,\n",
    "        # its own _c2), so events in one lane never overlap.\n",
    "        window = Window.partitionBy(\"_c1\").orderBy(\"_c3\")\n",
    "        df_pf=df_pf.withColumn(\"last_tsk_done\", F.lag('_c3', 1, None).over(window))\n",
    "        df_pf=df_pf.withColumn('last_tsk_done',F.coalesce('last_tsk_done','_c2'))\n",
    "        df_pf=df_pf.withColumn('last_tsk_done',F.when(F.col('_c2')>F.col('last_tsk_done'),F.col('_c2')).otherwise(F.col('last_tsk_done')) )\n",
    "        \n",
    "        trace_events=[]\n",
    "        \n",
    "        trace_events.extend(df_pf.select(\n",
    "            F.col('_c1').alias('tid'),\n",
    "            (F.col('last_tsk_done')-F.lit(self.starttime)).astype(IntegerType()).alias('ts'),\n",
    "            F.expr('_c3 - last_tsk_done  ').alias('dur'),\n",
    "            F.lit(id).alias('pid'),\n",
    "            F.lit('X').alias('ph'),\n",
    "            F.col('_c0').alias('name'),\n",
    "            F.struct(F.col('_id').alias('uuid')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        return trace_events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class dask_analysis_log(dask_analysis):\n",
    "    \"\"\"Same analysis as dask_analysis, constructed with an extra `logs` arg.\n",
    "\n",
    "    The original class duplicated load_data() and generate_trace_view_list()\n",
    "    byte-for-byte from dask_analysis; they are now inherited instead of\n",
    "    copied, so both classes stay in sync.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self,dask_file,logs):\n",
    "        # `logs` was ignored by the original implementation too; it is kept\n",
    "        # only so existing call sites remain valid.\n",
    "        dask_analysis.__init__(self,dask_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# instantevent analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "## format: _2 = Name; _3 = time\n",
    "\n",
    "class InstantEvent_analysis(Analysis):\n",
    "    \"\"\"Render each row of the input file as a Chrome-trace instant event.\n",
    "\n",
    "    Input columns (see cell header): _2 = event name, _3 = timestamp.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self,sar_file):\n",
    "        Analysis.__init__(self,sar_file)\n",
    "\n",
    "    def load_data(self):\n",
    "        # Parse the raw text file with the shared `splits` partition mapper.\n",
    "        lines=sc.textFile(self.file)\n",
    "        parsed=lines.mapPartitions(splits).toDF()\n",
    "        self.df=parsed\n",
    "        return parsed\n",
    "\n",
    "    def generate_trace_view_list(self,id=0,**kwargs):\n",
    "        # Each row becomes a global-scope instant event ('ph':'i','s':'g');\n",
    "        # _3 (seconds, presumably) is scaled to ms relative to starttime.\n",
    "        Analysis.generate_trace_view_list(self,id)\n",
    "        rel_ts=(F.col(\"_3\").astype(DoubleType())*1000-F.lit(self.starttime)).astype(IntegerType())\n",
    "        events=self.df.select(\n",
    "            F.lit(0).alias('tid'),\n",
    "            rel_ts.alias('ts'),\n",
    "            F.lit(0).alias('pid'),\n",
    "            F.lit('i').alias('ph'),\n",
    "            F.col(\"_2\").alias('name'),\n",
    "            F.lit(\"g\").alias(\"s\")\n",
    "        )\n",
    "        return events.toJSON().collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# HBM_Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class HBM_analysis(Analysis):\n",
    "    \"\"\"Counter ('C') trace events for HBM usage parsed from a csv log.\"\"\"\n",
    "\n",
    "    def __init__(self,file):\n",
    "        Analysis.__init__(self,file)\n",
    "\n",
    "    def load_data(self):\n",
    "        # CSV with a header row, fields separated by \", \"; derive an\n",
    "        # epoch-seconds 'ts' column and cast size/free to longs.\n",
    "        raw=spark.read.option(\"delimiter\", \", \").option(\"header\", \"true\").csv(self.file)\n",
    "        raw=raw.withColumn(\"ts\", F.unix_timestamp(raw.timestamp))\n",
    "        raw=raw.withColumn(\"size\", raw.size.cast(LongType()))\n",
    "        self.df=raw.withColumn(\"free\", raw.free.cast(LongType()))\n",
    "        return self.df\n",
    "\n",
    "    def generate_trace_view_list(self,id,**kwargs):\n",
    "        # Two counter series that share the tid/ts/pid/ph prefix columns.\n",
    "        trace_list=Analysis.generate_trace_view_list(self,id,**kwargs)\n",
    "        hbmdf=self.df\n",
    "\n",
    "        def base_cols():\n",
    "            return [\n",
    "                F.lit(0).alias('tid'),\n",
    "                (F.col(\"ts\") * F.lit(1000)-F.lit(self.starttime)).astype(LongType()).alias('ts'),\n",
    "                F.lit(id).alias('pid'),\n",
    "                F.lit('C').alias('ph'),\n",
    "            ]\n",
    "\n",
    "        # absolute used/free counters\n",
    "        trace_list.extend(hbmdf.select(\n",
    "            *base_cols(),\n",
    "            F.lit(\"hbm\").alias('name'),\n",
    "            F.struct((F.col(\"size\")-F.col(\"free\")).alias('hbmused'), F.col(\"free\").alias('hbmfree')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "\n",
    "        # utilization as a percentage\n",
    "        trace_list.extend(hbmdf.select(\n",
    "            *base_cols(),\n",
    "            F.lit(\"hbm %\").alias('name'),\n",
    "            F.struct(((F.lit(1) - F.col(\"free\") / F.col(\"size\")) * F.lit(100)).alias('%hbmused')).alias('args')\n",
    "        ).toJSON().collect())\n",
    "        return trace_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Run base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Run:\n",
    "    \"\"\"Bundle several analysis samples into one trace-viewer JSON file.\"\"\"\n",
    "\n",
    "    def __init__(self,samples):\n",
    "        self.samples=samples\n",
    "\n",
    "    def generate_trace_view(self,appid,**kwargs):\n",
    "        # The position of each sample doubles as its trace pid.\n",
    "        events=[]\n",
    "        for pid, sample in enumerate(self.samples):\n",
    "            events.extend(sample.generate_trace_view_list(pid,**kwargs))\n",
    "\n",
    "        header='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        '''\n",
    "        footer='''\n",
    "            ]\n",
    "        }'''\n",
    "        output=header + \",\\n\".join(events) + footer\n",
    "\n",
    "        with open('/home/sparkuser/trace_result/'+appid+'.json', 'w') as outfile:\n",
    "            outfile.write(output)\n",
    "\n",
    "        print(f\"http://{localhost}:1088/tracing_examples/trace_viewer.html#/tracing/test_data/{appid}.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Dask Application Run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Dask_Application_Run:\n",
    "    \"\"\"Collect trace events for one dask run plus its system-level samples.\"\"\"\n",
    "\n",
    "    def __init__(self, appid):\n",
    "        self.appid=appid\n",
    "        self.filedir=\"/tmp/dgx-2Log/\"+self.appid+\"/\"\n",
    "\n",
    "        # analysis name -> {'als': analysis object, 'pid': trace process id}\n",
    "        self.analysis={\n",
    "            'dask':{'als':dask_analysis(self.filedir+\"cluster.log\"),'pid':8000},\n",
    "            'sar_cpu':{'als':Sar_cpu_analysis(self.filedir+\"/sar_cpu.sar\"),'pid':0},\n",
    "            'sar_disk':{'als':Sar_disk_analysis(self.filedir+\"/sar_disk.sar\"),'pid':1},\n",
    "            'sar_mem':{'als':Sar_mem_analysis(self.filedir+\"/sar_mem.sar\"),'pid':2},\n",
    "            'sar_nic':{'als':Sar_nic_analysis(self.filedir+\"/sar_nic.sar\"),'pid':3},\n",
    "            'emon':{'als':Emon_Analysis(self.filedir+\"/emon.rst\"),'pid':4},\n",
    "            'gpu':{'als':gpu_analysis(self.filedir+\"/gpu.txt\"),'pid':5},\n",
    "        }\n",
    "\n",
    "    def generate_trace_view(self,showsar=True,showemon=False,showgpu=True,**kwargs):\n",
    "        traces=[]\n",
    "        daskals=self.analysis['dask']['als']\n",
    "        traces.extend(daskals.generate_trace_view_list(self.analysis['dask']['pid'],**kwargs))\n",
    "\n",
    "        def append_events(key):\n",
    "            # align the sample's clock with the dask run, then collect it\n",
    "            entry=self.analysis[key]\n",
    "            entry['als'].starttime=daskals.starttime\n",
    "            traces.extend(entry['als'].generate_trace_view_list(entry['pid'],**kwargs))\n",
    "\n",
    "        if showsar:\n",
    "            for key in ('sar_cpu','sar_disk','sar_mem','sar_nic'):\n",
    "                append_events(key)\n",
    "        if showemon:\n",
    "            append_events('emon')\n",
    "        if showgpu:\n",
    "            append_events('gpu')\n",
    "\n",
    "        header='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        '''\n",
    "        footer='''\n",
    "            ]\n",
    "        }'''\n",
    "        output=header + \",\\n\".join(traces) + footer\n",
    "\n",
    "        with open('/home/sparkuser/trace_result/'+self.appid+'.json', 'w') as outfile:\n",
    "            outfile.write(output)\n",
    "\n",
    "        print(\"http://sr219:1088/tracing_examples/trace_viewer.html#/tracing/test_data/\"+self.appid+\".json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Scratch check: human-readable form of a sample epoch timestamp.\n",
    "# NOTE(review): this rebinds `datetime` from the module (imported in the\n",
    "# setup cell) to the class -- later cells that use the module form of\n",
    "# `datetime` would break after running this cell.\n",
    "from datetime import datetime\n",
    "datetime.fromtimestamp(1546439400)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Dask_Application_Run2:\n",
    "    \"\"\"Per-source variant of Dask_Application_Run.\n",
    "\n",
    "    Keeps one analysis object per data source and delegates trace\n",
    "    generation to source-specific generate_*_trace_view helpers.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, appid):\n",
    "        self.appid=appid\n",
    "        \n",
    "        self.filedir=\"/tmp/dgx-2Log/\"+self.appid+\"/\"\n",
    "        self.dask=self.load_dask()\n",
    "        self.sar=self.load_sar()\n",
    "        self.gpu=self.load_gpu()\n",
    "        # emon data is optional; loaded lazily by generate_trace_view\n",
    "        # (bug fix: self.emon was never assigned anywhere before).\n",
    "        self.emon=None\n",
    "    \n",
    "    def load_dask(self):\n",
    "        return dask_analysis(self.filedir+\"cluster.log\")\n",
    "    \n",
    "    def load_sar(self):\n",
    "        return Sar_analysis(self.filedir+\"sar_data.sar\")\n",
    "        \n",
    "    def load_emon(self):\n",
    "        return Emon_Analysis(self.filedir+\"emon.rst\")\n",
    "    \n",
    "    def load_gpu(self):\n",
    "        return gpu_dmon_analysis(self.filedir+\"gpu_dmon.txt\")\n",
    "    \n",
    "    def generate_dask_trace_view(self):\n",
    "        return self.dask.generate_dask_trace_view(8000)\n",
    "    \n",
    "    def generate_sar_trace_view(self):\n",
    "        return self.sar.generate_sar_trace_view(0)\n",
    "    \n",
    "    def generate_gpu_trace_view(self):\n",
    "        return self.gpu.generate_gpu_trace_view(1)\n",
    "\n",
    "    def generate_emon_trace_view(self,collected_cores):\n",
    "        return self.emon.generate_emon_trace_view(5,collected_cores)\n",
    "    \n",
    "    def generate_trace_view(self,showsar=True,showemon=False,showgpu=True,collected_cores=None):\n",
    "        \"\"\"Write /home/sparkuser/trace_result/<appid>.json and print its URL.\n",
    "\n",
    "        collected_cores must be supplied when showemon=True (it was an\n",
    "        undefined global name before this fix).\n",
    "        \"\"\"\n",
    "        traces=[]\n",
    "        traces.extend(self.generate_dask_trace_view())\n",
    "        if showsar:\n",
    "            self.sar.starttime=self.dask.starttime\n",
    "            traces.extend(self.generate_sar_trace_view())\n",
    "        if showemon:\n",
    "            # bug fix: load emon on demand instead of dereferencing a\n",
    "            # never-assigned self.emon\n",
    "            if self.emon is None:\n",
    "                self.emon=self.load_emon()\n",
    "            traces.extend(self.generate_emon_trace_view(collected_cores))\n",
    "        if showgpu:\n",
    "            self.gpu.starttime=self.dask.starttime\n",
    "            traces.extend(self.generate_gpu_trace_view())\n",
    "        \n",
    "        output='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        ''' + \\\n",
    "        \",\\n\".join(traces)\\\n",
    "       + '''\n",
    "            ]\n",
    "        }'''\n",
    "\n",
    "        with open('/home/sparkuser/trace_result/'+self.appid+'.json', 'w') as outfile:\n",
    "            outfile.write(output)\n",
    "\n",
    "        # bug fix: f-string referenced bare `appid` (NameError in a method);\n",
    "        # `localhost` is presumably a notebook-level global -- TODO confirm.\n",
    "        print(f\"http://{localhost}:1088/tracing_examples/trace_viewer.html#/tracing/test_data/{self.appid}.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Application RUN STD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Application_Run_STD:\n",
    "    \"\"\"Trace generation for runs that only collected sar/emon/gpu data.\n",
    "\n",
    "    There is no application log, so all samples are anchored to the\n",
    "    current wall-clock time at generation.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, appid):\n",
    "        self.appid=appid\n",
    "        self.filedir=\"/tmp/dgx-2Log/\"+self.appid+\"/\"\n",
    "        \n",
    "        # analysis name -> {'als': analysis object, 'pid': trace process id}\n",
    "        self.analysis={\n",
    "            'sar':{'als':Sar_analysis(self.filedir+\"sar_data.sar\"),'pid':0},\n",
    "            'emon':{'als':Emon_Analysis(self.filedir+\"emon.rst\"),'pid':1},\n",
    "            'gpu':{'als':gpu_analysis(self.filedir+\"gpu.txt\"),'pid':100},\n",
    "        }\n",
    "    \n",
    "    def generate_trace_view(self,showsar=True,showemon=False,showgpu=True,**kwargs):\n",
    "        \"\"\"Write /home/sparkuser/trace_result/<appid>.json and print its URL.\"\"\"\n",
    "        traces=[]\n",
    "        # every selected sample shares 'now' (ms) as its time origin\n",
    "        starttime=time.time()*1000\n",
    "        if showsar:\n",
    "            sarals=self.analysis['sar']['als']\n",
    "            sarals.starttime=starttime\n",
    "            traces.extend(sarals.generate_trace_view_list(self.analysis['sar']['pid'],**kwargs))\n",
    "        if showemon:\n",
    "            emonals=self.analysis['emon']['als']\n",
    "            emonals.starttime=starttime\n",
    "            traces.extend(emonals.generate_trace_view_list(self.analysis['emon']['pid'],**kwargs))\n",
    "        if showgpu:\n",
    "            gpuals=self.analysis['gpu']['als']\n",
    "            gpuals.starttime=starttime\n",
    "            traces.extend(gpuals.generate_trace_view_list(self.analysis['gpu']['pid'],**kwargs))\n",
    "        \n",
    "        output='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        ''' + \\\n",
    "        \",\\n\".join(traces)\\\n",
    "       + '''\n",
    "            ]\n",
    "        }'''\n",
    "\n",
    "        with open('/home/sparkuser/trace_result/'+self.appid+'.json', 'w') as outfile:\n",
    "            outfile.write(output)\n",
    "\n",
    "        # bug fix: f-string referenced bare `appid` (NameError in a method);\n",
    "        # `localhost` is presumably a notebook-level global -- TODO confirm.\n",
    "        print(f\"http://{localhost}:1088/tracing_examples/trace_viewer.html#/tracing/test_data/{self.appid}.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Application Run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "class Application_Run:\n",
    "    def __init__(self, appid,**kwargs):\n",
    "        \"\"\"Discover per-node collector files for one application run.\n",
    "\n",
    "        kwargs: basedir -- HDFS root directory (default \"skylake\");\n",
    "        jobids -- forwarded to App_Log_Analysis when app.log exists.\n",
    "        \"\"\"\n",
    "        self.appid=appid\n",
    "        \n",
    "        basedir=kwargs.get(\"basedir\",\"skylake\")\n",
    "        self.filedir=\"/\"+basedir+\"/\"+self.appid+\"/\"\n",
    "        self.basedir=basedir\n",
    "        \n",
    "        # every sub-directory except summary.parquet is one slave node\n",
    "        slaves=fs.list_status(\"/\"+basedir+\"/\"+appid)\n",
    "        slaves=[f['pathSuffix'] for f in slaves if f['type']=='DIRECTORY' and f['pathSuffix']!=\"summary.parquet\"]\n",
    "        \n",
    "        jobids=kwargs.get(\"jobids\",None)\n",
    "        \n",
    "        self.clients=slaves\n",
    "        \n",
    "        # bug fix: these flags were previously only ever assigned (to True)\n",
    "        # inside the loop below, so runs without emon.rst/perfstat.txt raised\n",
    "        # AttributeError in generate_trace_view/get_summary.\n",
    "        self.show_emon=False\n",
    "        self.show_perfstat=False\n",
    "        \n",
    "        sarclnt={}\n",
    "        for idx,l in enumerate(self.clients):\n",
    "            # mandatory sar collectors; optional collectors are added only\n",
    "            # when their files exist on the node\n",
    "            sarclnt[l]={'sar_cpu':{'als':Sar_cpu_analysis(self.filedir + l + \"/\"+\"sar_cpu.sar\"),'pid':idx},\n",
    "                'sar_disk':{'als':Sar_disk_analysis(self.filedir + l + \"/\"+\"sar_disk.sar\"),'pid':idx},\n",
    "                'sar_mem':{'als':Sar_mem_analysis(self.filedir + l + \"/\"+\"sar_mem.sar\"),'pid':idx},\n",
    "                'sar_nic':{'als':Sar_nic_analysis(self.filedir + l + \"/\"+\"sar_nic.sar\"),'pid':idx}\n",
    "            }\n",
    "            if fs.exists(self.filedir + l + \"/sar_page.sar\"):\n",
    "                sarclnt[l]['sar_page']={'als':Sar_PageCache_analysis(self.filedir + l + \"/\"+\"sar_page.sar\"),'pid':idx}\n",
    "            \n",
    "            if fs.exists(self.filedir + l + \"/pidstat.out\"):\n",
    "                sarclnt[l]['sar_pid']={'als':Pidstat_analysis(self.filedir + l + \"/pidstat.out\"),'pid':idx}\n",
    "            if fs.exists(self.filedir + l + \"/sched.txt\"):\n",
    "                sarclnt[l]['sar_perf']={'als':Perf_trace_analysis(self.filedir + l + \"/sched.txt\"),'pid':100+idx}\n",
    "            if fs.exists(self.filedir + l + \"/emon.rst\"):\n",
    "                self.show_emon=True\n",
    "                sarclnt[l]['emon']={'als':Emon_Analysis(self.filedir + l + \"/emon.rst\"),'pid':200+idx}\n",
    "            if fs.exists(self.filedir + l + \"/perfstat.txt\"):\n",
    "                self.show_perfstat=True\n",
    "                sarclnt[l]['perfstat']={'als':Perfstat_analysis(self.filedir + l + \"/perfstat.txt\"),'pid':300+idx}\n",
    "            if fs.exists(self.filedir + l + \"/gpu.txt\"):\n",
    "                sarclnt[l]['gpu']={'als':gpu_analysis(self.filedir + l + \"/gpu.txt\"),'pid':400+idx}\n",
    "            \n",
    "        self.analysis={\n",
    "            \"sar\": sarclnt\n",
    "        }\n",
    "        \n",
    "        if fs.exists(self.filedir+\"app.log\"):\n",
    "            self.analysis['app']={'als':App_Log_Analysis(self.filedir+\"app.log\",jobids)}\n",
    "        \n",
    "        if fs.exists(self.filedir+\"instevent.out\"):\n",
    "            self.analysis['instant']={'als':InstantEvent_analysis(self.filedir+\"instevent.out\")}\n",
    "        \n",
    "        # optional recorded start time (epoch ms, presumably -- TODO confirm)\n",
    "        self.starttime=0\n",
    "        if fs.exists(self.filedir+\"starttime\"):\n",
    "            with fs.open(self.filedir+\"starttime\") as f:\n",
    "                st = f.read().decode('ascii')\n",
    "                self.starttime=int(st)\n",
    "    \n",
    "    def generate_trace_view(self,showsar=True,showgpu=True,showhbm=False,**kwargs):\n",
    "        \"\"\"Assemble all per-node events into one trace-viewer JSON file.\n",
    "\n",
    "        Writes /home/sparkuser/trace_result/<appid>.json, displays a link\n",
    "        and returns it.  kwargs of note: shownodes, xgbtcks, counttime.\n",
    "        \"\"\"\n",
    "        traces=[]\n",
    "        # restrict to a subset of nodes when requested; reject unknown names\n",
    "        shownodes=kwargs.get(\"shownodes\",self.clients)\n",
    "        for l in shownodes:\n",
    "            if l not in self.clients:\n",
    "                print(l,\"is not in clients\",self.clients)\n",
    "                return\n",
    "        self.clients=shownodes\n",
    "        \n",
    "        xgbtcks=kwargs.get('xgbtcks',(\"calltrain\",'enter','begin','end'))\n",
    "        \n",
    "        if \"app\" in self.analysis:\n",
    "            # NOTE(review): the whole {'als': ...} dict is passed as the id\n",
    "            # argument here (other callers pass a 'pid' int) -- verify that\n",
    "            # App_Log_Analysis.generate_trace_view_list expects this.\n",
    "            appals=self.analysis['app']['als']\n",
    "            appals.starttime=self.starttime\n",
    "            traces.extend(appals.generate_trace_view_list(self.analysis['app'],**kwargs))\n",
    "            self.starttime=appals.starttime\n",
    "        \n",
    "        if 'instant' in self.analysis:\n",
    "            als=self.analysis['instant']['als']\n",
    "            als.starttime=self.starttime\n",
    "            traces.extend(als.generate_trace_view_list(**kwargs))\n",
    "        \n",
    "        counttime=kwargs.get(\"counttime\",False)\n",
    "        \n",
    "        # per-node sar events; emon data is only loaded here and its events\n",
    "        # are produced in a single pass afterwards (pidmap: node -> pid)\n",
    "        pidmap={}\n",
    "        if showsar:\n",
    "            for l in self.clients:\n",
    "                for alskey, sarals in self.analysis[\"sar\"][l].items():\n",
    "                    t1 = time.time()\n",
    "                    if alskey!=\"emon\":\n",
    "                        sarals['als'].starttime=self.starttime\n",
    "                        traces.extend(sarals['als'].generate_trace_view_list(sarals['pid'],node=l, **kwargs))\n",
    "                    elif self.show_emon:\n",
    "                        sarals['als'].load_data()\n",
    "                        pidmap[l]=sarals['pid']\n",
    "                    if counttime:\n",
    "                        print(l,alskey,\" spend time: \", time.time()-t1)\n",
    "        if self.show_emon:\n",
    "            t1 = time.time()\n",
    "            emondfs=get_emon_parquets([self.appid,],self.basedir)\n",
    "            emons=Emon_Analysis_All(emondfs)\n",
    "            emons.starttime=self.starttime\n",
    "            traces.extend(emons.generate_trace_view_list(0,pidmap=pidmap,**kwargs))\n",
    "            if counttime:\n",
    "                print(\"emon process spend time: \", time.time()-t1)\n",
    "            self.emons=emons\n",
    "        \n",
    "        if showhbm:\n",
    "            for l in self.clients:\n",
    "                t1 = time.time()\n",
    "                hbm_analysis=HBM_analysis(self.filedir + l + \"/numactl.csv\")\n",
    "                hbm_analysis.starttime=self.starttime\n",
    "                traces.extend(hbm_analysis.generate_trace_view_list(0,**kwargs))\n",
    "                if counttime:\n",
    "                    print(l, \" hbm process spend time: \", time.time()-t1)\n",
    "        \n",
    "        # metadata ('M') events pinning the display order of the pid lanes\n",
    "        for idx,l in enumerate(self.clients):\n",
    "            traces.append(json.dumps({\"name\": \"process_sort_index\",\"ph\": \"M\",\"pid\":idx,\"tid\":0,\"args\":{\"sort_index \":idx}}))\n",
    "            traces.append(json.dumps({\"name\": \"process_sort_index\",\"ph\": \"M\",\"pid\":idx+100,\"tid\":0,\"args\":{\"sort_index \":idx+100}}))\n",
    "            traces.append(json.dumps({\"name\": \"process_sort_index\",\"ph\": \"M\",\"pid\":idx+200,\"tid\":0,\"args\":{\"sort_index \":idx+200}}))\n",
    "        \n",
    "        if \"app\" in self.analysis:\n",
    "            for pid in self.analysis['app']['als'].pids:\n",
    "                traces.append(json.dumps({\"name\": \"process_sort_index\",\"ph\": \"M\",\"pid\":pid+200,\"tid\":0,\"args\":{\"sort_index \":pid+200}}))\n",
    "\n",
    "        # xgbtck.txt rows are \"<tag> <name> <rank> <time>\"; keep the earliest\n",
    "        # time per name and emit them as global instant events\n",
    "        allcnt=\"\"\n",
    "        for c in self.clients:\n",
    "            paths=self.filedir+c\n",
    "            if fs.exists(paths+\"/xgbtck.txt\"):\n",
    "                with fs.open(paths+\"/xgbtck.txt\") as f:\n",
    "                    tmp = f.read().decode('ascii')\n",
    "                    allcnt=allcnt+tmp\n",
    "        allcnt=allcnt.strip().split(\"\\n\")\n",
    "        if len(allcnt) > 1:\n",
    "            allcnt=[l.split(\" \") for l in allcnt]\n",
    "            cnts=pandas.DataFrame([[l[0],l[1],l[2],l[3]] for l in allcnt if len(l)>1 and l[1] in xgbtcks])\n",
    "            if len(cnts) > 0:\n",
    "                cnts.columns=['xgbtck','name','rank','time']\n",
    "                cntgs=cnts.groupby(\"name\").agg({\"time\":\"min\"})\n",
    "                cntgs=cntgs.reset_index()\n",
    "                cntgs.columns=['name','ts']\n",
    "                cntgs['ph']=\"i\"\n",
    "                cntgs['ts']=pandas.to_numeric(cntgs['ts'])-self.starttime\n",
    "                cntgs['pid']=0\n",
    "                cntgs['tid']=0\n",
    "                cntgs['s']='g'\n",
    "                traces.extend([json.dumps(l) for l in cntgs.to_dict(orient='records')])\n",
    "        \n",
    "        output='''\n",
    "        {\n",
    "            \"traceEvents\": [\n",
    "        \n",
    "        ''' + \\\n",
    "        \",\\n\".join(traces)\\\n",
    "       + '''\n",
    "            ],\n",
    "            \"displayTimeUnit\": \"ns\"\n",
    "        }'''\n",
    "\n",
    "        with open('/home/sparkuser/trace_result/'+self.appid+'.json', 'w') as outfile:  \n",
    "            outfile.write(output)\n",
    "        \n",
    "        traceview_link=f'http://{local_ip}:1088/tracing_examples/trace_viewer.html#/tracing/test_data/{self.appid}.json'\n",
    "        display(HTML(f\"<a href={traceview_link}>{traceview_link}</a>\"))\n",
    "        return traceview_link\n",
    "\n",
    "    def getemonmetric(app,**kwargs):\n",
    "        \"\"\"Aggregate the emon metrics named in kwargs['show_metric'].\n",
    "\n",
    "        Returns a pandas DataFrame (metric rows x client columns), or None\n",
    "        when no metric was requested.\n",
    "        \"\"\"\n",
    "        emondfs=get_emon_parquets([app.appid],app.basedir)\n",
    "        emons=Emon_Analysis_All(emondfs)\n",
    "        # per-metric spark aggregator; unlisted metrics default to F.avg below\n",
    "        metric_msg_map={\n",
    "            'emon_instr_retired':F.sum\n",
    "        }\n",
    "        \n",
    "        emonmetric=kwargs.get(\"show_metric\",None)\n",
    "        # robustness fix: iterating over None raised TypeError before\n",
    "        if not emonmetric:\n",
    "            return None\n",
    "\n",
    "        outdf=None\n",
    "        for k in emonmetric:\n",
    "            m=emons.emon_metrics[k]\n",
    "            for fk,fm in m['formula'].items():\n",
    "                if k not in metric_msg_map:\n",
    "                    metric_msg_map[k]=F.avg\n",
    "                df=emons.gen_reduce_metric(k,list(range(0,emons.totalcores)),fk,metric_msg_map[k])\n",
    "                tmpdf=df.groupBy(\"appid\",'client').agg(*[l(\"`{:s}`\".format(fk)).alias(get_alias_name(fk,l)) for l in [metric_msg_map[k]]]).toPandas()\n",
    "                tmpdf=tmpdf.set_index(\"client\").drop(columns=['appid']).T\n",
    "                # bug fix: DataFrame.append was removed in pandas 2.x\n",
    "                outdf=tmpdf if outdf is None else pandas.concat([outdf,tmpdf])\n",
    "        pandas.options.display.float_format = '{:,.2f}'.format\n",
    "        return outdf\n",
    "    \n",
    "    def get_sar_stat(app,**kwargs):\n",
    "        \"\"\"Concatenate per-client sar statistics into one pandas DataFrame.\n",
    "\n",
    "        kwargs: disk_prefix (default \"dev259\"), nic_prefix.\n",
    "        \"\"\"\n",
    "        disk_prefix=kwargs.get(\"disk_prefix\",\"dev259\")\n",
    "        nic_prefix = kwargs.get(\"nic_prefix\",[\"'eth3'\",\"'enp24s0f1'\"])\n",
    "        def joined(stats):\n",
    "            # fold the per-client frames side by side\n",
    "            return reduce(lambda l,r:l.join(r),stats)\n",
    "        cpustat=joined([app.analysis[\"sar\"][l]['sar_cpu']['als'].get_stat() for l in app.clients])\n",
    "        diskstat=joined([app.analysis[\"sar\"][l]['sar_disk']['als'].get_stat(disk_prefix=disk_prefix) for l in app.clients])\n",
    "        memstat=joined([app.analysis[\"sar\"][l]['sar_mem']['als'].get_stat() for l in app.clients])\n",
    "        nicstat=joined([app.analysis[\"sar\"][l]['sar_nic']['als'].get_stat(nic_prefix=nic_prefix) for l in app.clients])\n",
    "        stats=[cpustat,diskstat,memstat,nicstat]\n",
    "        # bug fix: 'sar_page' is only registered when sar_page.sar exists,\n",
    "        # so unconditional access raised KeyError on runs without it\n",
    "        pagestat=[app.analysis[\"sar\"][l]['sar_page']['als'].get_stat() for l in app.clients if 'sar_page' in app.analysis[\"sar\"][l]]\n",
    "        if pagestat:\n",
    "            stats.append(joined(pagestat))\n",
    "        pandas.options.display.float_format = '{:,.2f}'.format\n",
    "        return pandas.concat(stats)\n",
    "        \n",
    "    def get_perf_stat(self, **kwargs):\n",
    "        # join each client's perfstat frame side by side\n",
    "        frames=[self.analysis[\"sar\"][c]['perfstat']['als'].get_stat() for c in self.clients]\n",
    "        return reduce(lambda left,right: left.join(right), frames)\n",
    "        \n",
    "    def get_summary(app, **kwargs):\n",
    "        \"\"\"Build a one-column summary (query time + sar [+ emon/perf] aggregates)\n",
    "        and persist it to <filedir>/summary.parquet.\"\"\"\n",
    "        output=[]  # NOTE(review): unused in this method\n",
    "        \n",
    "        appals=app.analysis[\"app\"][\"als\"]\n",
    "        \n",
    "        out=appals.get_query_time(plot=False)\n",
    "        \n",
    "        lrun=app.appid\n",
    "        \n",
    "        cmpcolumns=['runtime','disk spilled','shuffle_write','f_wait_time','input read','acc_task_time','output rows']\n",
    "        outcut=out[cmpcolumns]\n",
    "        \n",
    "        # totals across queries, as a single column named after the run\n",
    "        pdsout=pandas.DataFrame(outcut.sum(),columns=[lrun])\n",
    "        pdstime=pdsout  \n",
    "\n",
    "        if app.show_emon:\n",
    "            emondf=app.getemonmetric(**kwargs)\n",
    "            # rows whose index ends in \"avg\" are averaged across clients,\n",
    "            # all other rows are summed\n",
    "            def get_agg(emondf):\n",
    "                aggs=[]\n",
    "                for x in emondf.index:\n",
    "                    if x.endswith(\"avg\"):\n",
    "                        aggs.append(emondf.loc[x].mean())\n",
    "                    else:\n",
    "                        aggs.append(emondf.loc[x].sum())\n",
    "\n",
    "                emondf['agg']=aggs\n",
    "                return emondf\n",
    "            emondf=get_agg(emondf)\n",
    "\n",
    "            emonsum=emondf[[\"agg\"]]\n",
    "\n",
    "            emonsum.columns=[lrun]\n",
    "\n",
    "        print(\"sar metric\")\n",
    "        sardf=app.get_sar_stat(**kwargs)\n",
    "        \n",
    "        # \"total\" rows are summed, \"max\" rows maxed, the rest averaged\n",
    "        def get_sar_agg(sardf):\n",
    "            aggs=[]\n",
    "            for x in sardf.index:\n",
    "                if \"total\" in x:\n",
    "                    aggs.append(sardf.loc[x].sum())\n",
    "                elif \"max\" in x:\n",
    "                    aggs.append(sardf.loc[x].max())\n",
    "                else:\n",
    "                    aggs.append(sardf.loc[x].mean())\n",
    "\n",
    "            sardf['agg']=aggs\n",
    "            return sardf\n",
    "        sardf=get_sar_agg(sardf)\n",
    "\n",
    "        sarsum=sardf[[\"agg\"]]\n",
    "\n",
    "        sarsum.columns=[lrun]\n",
    "        \n",
    "        summary=pandas.concat([pdstime,sarsum])\n",
    "        if app.show_emon:\n",
    "            summary=pandas.concat([summary,emonsum])\n",
    "        elif app.show_perfstat:\n",
    "            print(\"perf stat metric\")\n",
    "            perf_stat = app.get_perf_stat(**kwargs)\n",
    "            perf_stat = get_sar_agg(perf_stat)[['agg']]\n",
    "            perf_stat.columns=[lrun]\n",
    "            summary=pandas.concat([summary,perf_stat])\n",
    "            \n",
    "        # normalize column names (drop spaces/parens) before writing parquet\n",
    "        df_sum=spark.createDataFrame(summary.T.reset_index())\n",
    "        for c in df_sum.columns:\n",
    "            df_sum=df_sum.withColumnRenamed(c,c.replace(\" \",\"_\").replace(\"(\",\"\").replace(\")\",\"\"))\n",
    "        df_sum.write.mode(\"overwrite\").parquet(app.filedir+\"summary.parquet\")\n",
    "        \n",
    "        return summary\n",
    "    \n",
    "    def compare_app(app2,**kwargs):\n",
    "        output=[]\n",
    "        \n",
    "        lbasedir=kwargs.get(\"basedir\",app2.basedir)\n",
    "        r_appid=kwargs.get(\"r_appid\",app2.appid)\n",
    "        \n",
    "        app=kwargs.get(\"rapp\",Application_Run(r_appid,basedir=lbasedir))\n",
    "\n",
    "        show_queryplan_diff=kwargs.get(\"show_queryplan_diff\",True)\n",
    "        \n",
    "        queryids=kwargs.get(\"queryids\",None)\n",
    "        \n",
    "        appals=app.analysis[\"app\"][\"als\"]\n",
    "        appals2=app2.analysis[\"app\"][\"als\"]\n",
    "\n",
    "        out=appals.get_query_time(plot=False)\n",
    "        out2=appals2.get_query_time(plot=False)\n",
    "\n",
    "        lrun=app.appid\n",
    "        rrun=app2.appid\n",
    "        cmpcolumns=['runtime','shuffle_write','f_wait_time','input read','acc_task_time','output rows']\n",
    "        outcut=out[cmpcolumns]\n",
    "        out2cut=out2[cmpcolumns]\n",
    "        cmp=outcut.join(out2cut,lsuffix='_'+lrun,rsuffix='_'+rrun)\n",
    "\n",
    "        pdsout=pandas.DataFrame(outcut.sum(),columns=[lrun])\n",
    "        pdsout2=pandas.DataFrame(out2cut.sum(),columns=[rrun])\n",
    "        pdstime=pdsout.join(pdsout2)\n",
    "\n",
    "        showemon=app.show_emon and app2.show_emon\n",
    "        if showemon:\n",
    "            print(\"emon metric\")\n",
    "\n",
    "            emondf=app.getemonmetric(**kwargs)\n",
    "            emondf2=app2.getemonmetric(**kwargs)\n",
    "            #in case we comare with two clsuter\n",
    "            emondf.columns=emondf2.columns\n",
    "            def get_agg(emondf):\n",
    "                aggs=[]\n",
    "                for x in emondf.index:\n",
    "                    if x.endswith(\"avg\"):\n",
    "                        aggs.append(emondf.loc[x].mean())\n",
    "                    else:\n",
    "                        aggs.append(emondf.loc[x].sum())\n",
    "\n",
    "                emondf['agg']=aggs\n",
    "                return emondf\n",
    "            emondf=get_agg(emondf)\n",
    "            emondf2=get_agg(emondf2)\n",
    "\n",
    "            emoncolumns=emondf.columns\n",
    "            emoncmp=emondf.join(emondf2,lsuffix='_'+lrun,rsuffix='_'+rrun)\n",
    "            emonsum=emoncmp[[\"agg_\"+lrun,\"agg_\"+rrun]]\n",
    "\n",
    "            emonsum.columns=[lrun,rrun]\n",
    "\n",
    "        print(\"sar metric\")\n",
    "        sardf=app.get_sar_stat(**kwargs)\n",
    "        sardf2=app2.get_sar_stat(**kwargs)\n",
    "        \n",
    "        def get_sar_agg(sardf):\n",
    "            aggs=[]\n",
    "            for x in sardf.index:\n",
    "                if \"total\" in x:\n",
    "                    aggs.append(sardf.loc[x].sum())\n",
    "                elif \"max\" in x:\n",
    "                    aggs.append(sardf.loc[x].max())\n",
    "                else:\n",
    "                    aggs.append(sardf.loc[x].mean())\n",
    "\n",
    "            sardf['agg']=aggs\n",
    "            return sardf\n",
    "        sardf=get_sar_agg(sardf)\n",
    "        sardf2=get_sar_agg(sardf2)\n",
    "        #in case we compare two clusters\n",
    "        sardf2.columns=sardf.columns\n",
    "\n",
    "        sarcolumns=sardf.columns\n",
    "        sarcmp=sardf.join(sardf2,lsuffix='_'+lrun,rsuffix='_'+rrun)\n",
    "        sarsum=sarcmp[[\"agg_\"+lrun,\"agg_\"+rrun]]\n",
    "\n",
    "        sarsum.columns=[lrun,rrun]\n",
    "        \n",
    "        summary=pandas.concat([pdstime,sarsum])\n",
    "        if showemon:\n",
    "            summary=pandas.concat([summary,emonsum])\n",
    "            \n",
    "        summary[\"diff\"]=numpy.where(summary[rrun] > 0, summary[lrun]/summary[rrun]-1, 0)\n",
    "        \n",
    "        \n",
    "        def highlight_diff(x):\n",
    "            styles=[]\n",
    "            mx=x.max()\n",
    "            mn=x.min()\n",
    "            mx=max(mx,-mn,0.2)\n",
    "            for j in x.index:\n",
    "                m1=(x[j])/mx*100 if x[j]!=None else 0\n",
    "                if m1>0:\n",
    "                    styles.append(f'width: 400px ; background-image: linear-gradient(to right, transparent 50%, #5fba7d 50%, #5fba7d {50+m1/2}%, transparent {50+m1/2}%)')\n",
    "                else:\n",
    "                    styles.append(f'width: 400px ;background-image: linear-gradient(to left, transparent 50%, #f1a863 50%, #f1a863 {50-m1/2}%, transparent {50-m1/2}%)')\n",
    "            return styles\n",
    "\n",
    "        output.append(summary.style.apply(highlight_diff,subset=['diff']).format({lrun:\"{:,.2f}\",rrun:\"{:,.2f}\",'diff':\"{:,.2%}\"}).render())\n",
    "\n",
    "        cmp_plot=cmp\n",
    "        cmp_plot['diff']=cmp_plot['runtime_'+lrun]-cmp_plot['runtime_'+rrun]\n",
    "\n",
    "        pltx=cmp_plot.sort_values(by='diff',axis=0).plot.bar(y=['runtime_'+lrun,'runtime_'+rrun],figsize=(30,8))\n",
    "        better_num=sqldf('''select count(*) from cmp_plot where diff>0''')['count(*)'][0]\n",
    "        pltx.text(0.1, 0.8,'{:d} queries are better'.format(better_num), ha='center', va='center', transform=pltx.transAxes)\n",
    "\n",
    "        df1 = pandas.DataFrame('', index=cmp.index, columns=cmpcolumns)\n",
    "        for l in cmpcolumns:\n",
    "            for j in cmp.index:\n",
    "                df1[l][j]=[cmp[l+\"_\"+lrun][j],cmp[l+\"_\"+rrun][j],cmp[l+\"_\"+lrun][j]/cmp[l+\"_\"+rrun][j]-1]\n",
    "\n",
    "        def highlight_greater(x,columns):\n",
    "            df1 = pandas.DataFrame('', index=x.index, columns=x.columns)\n",
    "            for l in columns:\n",
    "                m={}\n",
    "                for j in x.index:\n",
    "                    m[j] = (x[l][j][1] / x[l][j][0])*100 if x[l][j][0]!=0 else 100\n",
    "                mx=max(m.values())-100\n",
    "                mn=100-min(m.values())\n",
    "                mx=max(mx,mn)\n",
    "                for j in x.index:\n",
    "                    m1=-(100-m[j])/mx*100 if x[l][j][0]!=0 else 0\n",
    "                    if m1>0:\n",
    "                        df1[l][j] = f'background-image: linear-gradient(to right, transparent 50%, #5fba7d 50%, #5fba7d {50+m1/2}%, transparent {50+m1/2}%)'\n",
    "                    else:\n",
    "                        df1[l][j] = f'background-image: linear-gradient(to left, transparent 50%, #f1a863 50%, #f1a863 {50-m1/2}%, transparent {50-m1/2}%)'\n",
    "\n",
    "            return df1\n",
    "\n",
    "        def display_compare(df,columns):\n",
    "            output.append(df.style.set_properties(**{'width': '300px','border-style':'solid','border-width':'1px'}).apply(lambda x: highlight_greater(x,columns), axis=None).format(lambda x: '''\n",
    "                                                                          <div style='max-width: 30%; min-width:30%;display:inline-block;'>{:,.2f}</div>\n",
    "                                                                          <div style='max-width: 30%; min-width:30%; display:inline-block;'>{:,.2f}</div>\n",
    "                                                                          <div style='max-width: 30%; min-width:30%; display:inline-block;color:blue'>{:,.2f}%</div>\n",
    "                                                                       '''.format(x[0],x[1],x[2]*100)).render())\n",
    "        display_compare(df1,cmpcolumns)\n",
    "\n",
    "        df3 = pandas.DataFrame('', index=sarcmp.index, columns=sarcolumns)\n",
    "        for l in sarcolumns:\n",
    "            for j in df3.index:\n",
    "                df3[l][j]=[sarcmp[l+\"_\"+lrun][j],sarcmp[l+\"_\"+rrun][j],sarcmp[l+\"_\"+lrun][j]/sarcmp[l+\"_\"+rrun][j]-1]\n",
    "        display_compare(df3,sarcolumns)\n",
    "\n",
    "        if showemon:\n",
    "            df2 = pandas.DataFrame('', index=emoncmp.index, columns=emoncolumns)\n",
    "            for l in emoncolumns:\n",
    "                for j in df2.index:\n",
    "                    df2[l][j]=[emoncmp[l+\"_\"+lrun][j],emoncmp[l+\"_\"+rrun][j],emoncmp[l+\"_\"+lrun][j]/emoncmp[l+\"_\"+rrun][j]-1]\n",
    "            display_compare(df2,emoncolumns)\n",
    "\n",
    "        print(\"time breakdown\")\n",
    "        ################################ time breakdown ##################################################################################################\n",
    "        timel=appals.show_time_metric(plot=False)\n",
    "        timer=appals2.show_time_metric(plot=False)\n",
    "        timer.columns=[l.replace(\"scan time\",\"time_batchscan\") for l in timer.columns]\n",
    "        timel.columns=[l.replace(\"scan time\",\"time_batchscan\") for l in timel.columns]\n",
    "        rcols=timer.columns\n",
    "        lcols=[]\n",
    "        for c in [l.split(\"%\")[1][1:] for l in rcols]:\n",
    "            for t in timel.columns:\n",
    "                if t.endswith(c):\n",
    "                    lcols.append(t)\n",
    "        for t in timel.columns:\n",
    "            if t not in lcols:\n",
    "                lcols.append(t)\n",
    "        timel_adj=timel[lcols]\n",
    "\n",
    "        fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True,figsize=(30,8),gridspec_kw = {'width_ratios':[1, 1]})\n",
    "        plt.subplots_adjust(wspace=0.01)\n",
    "        ax=timel_adj.plot.bar(ax=axs[0],stacked=True)\n",
    "        list_values=timel_adj.loc[0].values\n",
    "        for rect, value in zip(ax.patches, list_values):\n",
    "            h = rect.get_height() /2.\n",
    "            w = rect.get_width() /2.\n",
    "            x, y = rect.get_xy()\n",
    "            ax.text(x+w, y+h,\"{:,.2f}\".format(value),horizontalalignment='center',verticalalignment='center',color=\"white\")\n",
    "        ax=timer.plot.bar(ax=axs[1],stacked=True)\n",
    "        list_values=timer.loc[0].values\n",
    "        for rect, value in zip(ax.patches, list_values):\n",
    "            h = rect.get_height() /2.\n",
    "            w = rect.get_width() /2.\n",
    "            x, y = rect.get_xy()\n",
    "            ax.text(x+w, y+h,\"{:,.2f}\".format(value),horizontalalignment='center',verticalalignment='center',color=\"white\")\n",
    "\n",
    "################################ critical time breakdown ##################################################################################################\n",
    "        timel=appals.show_time_metric(plot=False,taskids=[l[0].item() for l in appals.criticaltasks])\n",
    "        timer=appals2.show_time_metric(plot=False,taskids=[l[0].item() for l in appals2.criticaltasks])\n",
    "        timer.columns=[l.replace(\"scan time\",\"time_batchscan\") for l in timer.columns]\n",
    "        timel.columns=[l.replace(\"scan time\",\"time_batchscan\") for l in timel.columns]\n",
    "        rcols=timer.columns\n",
    "        lcols=[]\n",
    "        for c in [l.split(\"%\")[1][1:] for l in rcols]:\n",
    "            for t in timel.columns:\n",
    "                if t.endswith(c):\n",
    "                    lcols.append(t)\n",
    "        for t in timel.columns:\n",
    "            if t not in lcols:\n",
    "                lcols.append(t)\n",
    "        timel_adj=timel[lcols]\n",
    "\n",
    "        fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True,figsize=(30,8),gridspec_kw = {'width_ratios':[1, 1]})\n",
    "        plt.subplots_adjust(wspace=0.01)\n",
    "        ax=timel_adj.plot.bar(ax=axs[0],stacked=True)\n",
    "        list_values=timel_adj.loc[0].values\n",
    "        for rect, value in zip(ax.patches, list_values):\n",
    "            h = rect.get_height() /2.\n",
    "            w = rect.get_width() /2.\n",
    "            x, y = rect.get_xy()\n",
    "            ax.text(x+w, y+h,\"{:,.2f}\".format(value),horizontalalignment='center',verticalalignment='center',color=\"white\")\n",
    "        ax=timer.plot.bar(ax=axs[1],stacked=True)\n",
    "        list_values=timer.loc[0].values\n",
    "        for rect, value in zip(ax.patches, list_values):\n",
    "            h = rect.get_height() /2.\n",
    "            w = rect.get_width() /2.\n",
    "            x, y = rect.get_xy()\n",
    "            ax.text(x+w, y+h,\"{:,.2f}\".format(value),horizontalalignment='center',verticalalignment='center',color=\"white\")\n",
    "\n",
    "\n",
    "        ################################ hot stage ##########################################################################################################\n",
    "\n",
    "        hotstagel=appals.get_hottest_stages(plot=False)\n",
    "        hotstager=appals2.get_hottest_stages(plot=False)\n",
    "        hotstagel.style.format(lambda x: '''{:,.2f}'''.format(x))\n",
    "\n",
    "        norm = matplotlib.colors.Normalize(vmin=0, vmax=max(hotstager.queryid))\n",
    "        cmap = matplotlib.cm.get_cmap('brg')\n",
    "        def setbkcolor(x):\n",
    "            rgba=cmap(norm(x['queryid']))\n",
    "            return ['background-color:rgba({:d},{:d},{:d},1); color:white'.format(int(rgba[0]*255),int(rgba[1]*255),int(rgba[2]*255))]*9\n",
    "\n",
    "        output.append(\"<table><tr><td>\" + hotstagel.style.apply(setbkcolor,axis=1).format({\"total_time\":lambda x: '{:,.2f}'.format(x),\"stdev_time\":lambda x: '{:,.2f}'.format(x),\"acc_total\":lambda x: '{:,.2%}'.format(x),\"total\":lambda x: '{:,.2%}'.format(x)}).render()+\n",
    "             \"</td><td>\" +  hotstager.style.apply(setbkcolor,axis=1).format({\"total_time\":lambda x: '{:,.2f}'.format(x),\"stdev_time\":lambda x: '{:,.2f}'.format(x),\"acc_total\":lambda x: '{:,.2%}'.format(x),\"total\":lambda x: '{:,.2%}'.format(x)}).render()+             \"</td></tr></table>\")\n",
    "\n",
    "        if not show_queryplan_diff:\n",
    "            return \"\\n\".join(output)\n",
    "        \n",
    "        print(\"hot stage\")\n",
    "\n",
    "        loperators=appals.getOperatorCount()\n",
    "        roperators=appals2.getOperatorCount()\n",
    "        loperators_rowcnt=appals.get_metric_output_rowcnt()\n",
    "        roperators_rowcnt=appals2.get_metric_output_rowcnt()\n",
    "        \n",
    "        def show_query_diff(queryid, always_show=True):\n",
    "            lops=pandas.DataFrame(loperators[queryid])\n",
    "            lops.columns=['calls_l']\n",
    "            lops=lops.loc[lops['calls_l'] >0]\n",
    "\n",
    "            rops=pandas.DataFrame(roperators[queryid])\n",
    "            rops.columns=[\"calls_r\"]\n",
    "            rops=rops.loc[rops['calls_r'] >0]\n",
    "            lops_row=pandas.DataFrame(loperators_rowcnt[queryid])\n",
    "            lops_row.columns=[\"rows_l\"]\n",
    "            lops_row=lops_row.loc[lops_row['rows_l'] >0]\n",
    "\n",
    "            rops_row=pandas.DataFrame(roperators_rowcnt[queryid])\n",
    "            rops_row.columns=[\"rows_r\"]\n",
    "            rops_row=rops_row.loc[rops_row['rows_r'] >0]\n",
    "\n",
    "            opscmp=pandas.merge(pandas.merge(pandas.merge(lops,rops,how=\"outer\",left_index=True,right_index=True),lops_row,how=\"outer\",left_index=True,right_index=True),rops_row,how=\"outer\",left_index=True,right_index=True)\n",
    "            opscmp=opscmp.fillna(\"\")\n",
    "            \n",
    "            def set_bk_color_opscmp(x):\n",
    "                calls_l= 0 if x['calls_l']==\"\" else x['calls_l']\n",
    "                calls_r= 0 if x['calls_r']==\"\" else x['calls_r']\n",
    "                rows_l= 0 if x['rows_l']==\"\" else x['rows_l']\n",
    "                rows_r= 0 if x['rows_r']==\"\" else x['rows_r']\n",
    "\n",
    "                if calls_l > calls_r or rows_l > rows_r:\n",
    "                    return ['background-color:#eb6b34']*4\n",
    "                if calls_l < calls_r or rows_l < rows_r:\n",
    "                    return ['background-color:#8ad158']*4\n",
    "                return ['color:#dbd4d0']*4\n",
    "\n",
    "            if always_show or not (opscmp[\"rows_l\"].equals(opscmp[\"rows_r\"]) and opscmp[\"calls_l\"].equals(opscmp[\"calls_r\"])):\n",
    "                print(f\"query  {queryid}  queryplan diff \")\n",
    "                if not always_show:\n",
    "                    output.append(f\"<p><font size=4 color=red>query{queryid} is different</font></p>\")\n",
    "                output.append(opscmp.style.apply(set_bk_color_opscmp,axis=1).render())\n",
    "\n",
    "                planl=appals.get_query_plan(queryid=queryid,show_plan_only=True,plot=False)\n",
    "                planr=appals2.get_query_plan(queryid=queryid,show_plan_only=True,plot=False)\n",
    "                output.append(\"<table><tr><td>\"+planl+\"</td><td>\"+planr+\"</td></tr></table>\")\n",
    "\n",
    "        outputx=df1['output rows']\n",
    "        runtimex = df1['runtime']\n",
    "        for x in outputx.index:\n",
    "            if runtimex[x][0]/runtimex[x][1]<0.95 or runtimex[x][0]/runtimex[x][1]>1.05:\n",
    "                output.append(f\"<p><font size=4 color=red>query{x} is different,{lrun} time: {df1['runtime'][x][0]}, {rrun} time: {df1['runtime'][x][1]}</font></p>\")\n",
    "                if queryids is not None and x not in queryids:\n",
    "                    print(\"query plan skipped\")\n",
    "                    continue\n",
    "                try:\n",
    "                    show_query_diff(x, True)\n",
    "                except:\n",
    "                    print(\" query diff error\")\n",
    "            else:\n",
    "                try:\n",
    "                    show_query_diff(x, False)\n",
    "                except:\n",
    "                    print(\" query diff error\")\n",
    "                \n",
    "        return \"\\n\".join(output)\n",
    "                              \n",
    "\n",
    "                              \n",
    "    def show_queryplan_diff(app2, queryid,**kwargs):\n",
    "        \"\"\"Show an HTML operator/row-count diff plus both query plans for one query.\n",
    "\n",
    "        app2 is the receiver run; the reference run comes from kwargs\n",
    "        ('rapp', or constructed from 'r_appid'/'basedir'). Output is\n",
    "        rendered inline via display(HTML(...)); nothing is returned.\n",
    "        \"\"\"\n",
    "        lbasedir=kwargs.get(\"basedir\",app2.basedir)\n",
    "        r_appid=kwargs.get(\"r_appid\",app2.appid)\n",
    "        \n",
    "        # NOTE(review): the fallback Application_Run(...) is evaluated eagerly,\n",
    "        # i.e. even when kwargs already contains 'rapp' -- confirm intended.\n",
    "        app=kwargs.get(\"rapp\",Application_Run(r_appid,basedir=lbasedir))\n",
    "\n",
    "        appals=app.analysis[\"app\"][\"als\"]\n",
    "        appals2=app2.analysis[\"app\"][\"als\"]\n",
    "\n",
    "        # NOTE(review): hotstagel/hotstager are not used below in this function,\n",
    "        # and the Styler returned by .format() is discarded.\n",
    "        hotstagel=appals.get_hottest_stages(plot=False)\n",
    "        hotstager=appals2.get_hottest_stages(plot=False)\n",
    "        hotstagel.style.format(lambda x: '''{:,.2f}'''.format(x))\n",
    "\n",
    "        # Per-query operator call counts and operator output-row counts, per run.\n",
    "        loperators=appals.getOperatorCount()\n",
    "        roperators=appals2.getOperatorCount()\n",
    "        loperators_rowcnt=appals.get_metric_output_rowcnt()\n",
    "        roperators_rowcnt=appals2.get_metric_output_rowcnt()\n",
    "\n",
    "        lrun=app.appid\n",
    "        rrun=app2.appid\n",
    "\n",
    "        output=[]  # accumulated HTML fragments, joined and displayed at the end\n",
    "\n",
    "        def show_query_diff(queryid):\n",
    "            # One table per run and metric, keeping only operators actually used.\n",
    "            lops=pandas.DataFrame(loperators[queryid])\n",
    "            lops.columns=['calls_l']\n",
    "            lops=lops.loc[lops['calls_l'] >0]\n",
    "\n",
    "            rops=pandas.DataFrame(roperators[queryid])\n",
    "            rops.columns=[\"calls_r\"]\n",
    "            rops=rops.loc[rops['calls_r'] >0]\n",
    "            lops_row=pandas.DataFrame(loperators_rowcnt[queryid])\n",
    "            lops_row.columns=[\"rows_l\"]\n",
    "            lops_row=lops_row.loc[lops_row['rows_l'] >0]\n",
    "\n",
    "            rops_row=pandas.DataFrame(roperators_rowcnt[queryid])\n",
    "            rops_row.columns=[\"rows_r\"]\n",
    "            rops_row=rops_row.loc[rops_row['rows_r'] >0]\n",
    "\n",
    "            # Outer-join all four tables so operators present in only one run\n",
    "            # still show up; missing cells become \"\" (treated as 0 below).\n",
    "            opscmp=pandas.merge(pandas.merge(pandas.merge(lops,rops,how=\"outer\",left_index=True,right_index=True),lops_row,how=\"outer\",left_index=True,right_index=True),rops_row,how=\"outer\",left_index=True,right_index=True)\n",
    "            opscmp=opscmp.fillna(\"\")\n",
    "\n",
    "            def set_bk_color_opscmp(x):\n",
    "                # Row style: orange background when the left run has more calls or\n",
    "                # rows, green when the right run has more, grey text when equal.\n",
    "                calls_l= 0 if x['calls_l']==\"\" else x['calls_l']\n",
    "                calls_r= 0 if x['calls_r']==\"\" else x['calls_r']\n",
    "                rows_l= 0 if x['rows_l']==\"\" else x['rows_l']\n",
    "                rows_r= 0 if x['rows_r']==\"\" else x['rows_r']\n",
    "\n",
    "                if calls_l > calls_r or rows_l > rows_r:\n",
    "                    return ['background-color:#eb6b34']*4\n",
    "                if calls_l < calls_r or rows_l < rows_r:\n",
    "                    return ['background-color:#8ad158']*4\n",
    "                return ['color:#dbd4d0']*4\n",
    "\n",
    "            output.append(opscmp.style.apply(set_bk_color_opscmp,axis=1).render())\n",
    "\n",
    "            # Append the two rendered query plans side by side.\n",
    "            planl=appals.get_query_plan(queryid=queryid,show_plan_only=True,plot=False)\n",
    "            planr=appals2.get_query_plan(queryid=queryid,show_plan_only=True,plot=False)\n",
    "            output.append(\"<table><tr><td>\"+planl+\"</td><td>\"+planr+\"</td></tr></table>\")\n",
    "\n",
    "        x=queryid\n",
    "        print(\"query \",x,\" queryplan diff \")\n",
    "        #output.append(f\"<p><font size=4 color=red>query{x} is different,{lrun} time: {df1['runtime'][x][0]}, {rrun} time: {df1['runtime'][x][1]}</font></p>\")\n",
    "        show_query_diff(x)\n",
    "        display(HTML(\"\\n\".join(output)))\n",
    "        return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# MISC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def reduce_metric(pdrst,slave_id,metric,core,agg_func):\n",
    "    \"\"\"Expand each row's reduced metric into one alias-named column per aggregate.\n",
    "\n",
    "    Every row's 'app_id' object computes get_reduce_metric(...); the first row\n",
    "    of that result supplies the value for each aggregate's alias column. The\n",
    "    temporary 'rst' column is removed before returning.\n",
    "    \"\"\"\n",
    "    reduced = pdrst.apply(\n",
    "        lambda row: row['app_id'].get_reduce_metric(slave_id,metric,core,agg_func),\n",
    "        axis=1)\n",
    "    pdrst['rst'] = reduced\n",
    "    for func in agg_func:\n",
    "        alias = get_alias_name(metric,func)\n",
    "        pdrst[alias] = pdrst.apply(lambda row: row['rst'].iloc[0][alias], axis=1)\n",
    "    return pdrst.drop(columns=['rst'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def cvt_number(n):\n",
    "    try:\n",
    "        if str(n).isdigit():\n",
    "            return f'{n:,}'\n",
    "        else:\n",
    "            return f'{round(float(n),2):,}'\n",
    "    except ValueError:\n",
    "        return n\n",
    "\n",
    "def parse_changelog(changelog):\n",
    "    \"\"\"Read a git changelog from fs and return its lines as colorized HTML.\n",
    "\n",
    "    'commit', 'Author' and 'Date' header lines get <font> color markup; all\n",
    "    other lines pass through unchanged. If the file does not exist, a single\n",
    "    '<basename> not found!' line is returned instead.\n",
    "    \"\"\"\n",
    "    out=[]\n",
    "    if fs.exists(changelog):\n",
    "        with fs.open(changelog) as f:\n",
    "            for l in f.readlines():\n",
    "                l = l.decode('utf-8')\n",
    "                if l.startswith(\"commit\"):\n",
    "                    out.append(re.sub(r\"commit +(.+)\",r\"<font color=#BDCA57>commit </font><font color=#23C2BF>\\\\1</font>\",l))\n",
    "                elif l.startswith(\"Author\"):\n",
    "                    out.append(re.sub(r\"Author: +([^<]+) <(.+)>\",r\"<font color=#BDCA57>Author: </font><font color=#C02866>\\\\1</font> <<font color=#BC0DBD>\\\\2</font>> \",l))\n",
    "                elif l.startswith(\"Date\"):\n",
    "                    # Bug fix: the label for Date lines previously read 'Author: '.\n",
    "                    out.append(re.sub(r\"Date: +(\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d)\",r\"<font color=#BDCA57>Date: </font>\\\\1\",l))\n",
    "                else:\n",
    "                    out.append(l)\n",
    "    else:\n",
    "        out.append(f'{os.path.basename(changelog)} not found!')\n",
    "    return out\n",
    "\n",
    "def generate_query_diff(name, comp_name, query_time_file, comp_query_time_file):\n",
    "    \"\"\"Build a per-query runtime comparison table from two query_time.json files.\n",
    "\n",
    "    Returns a list of rows [query, t_base, t_comp, difference, percentage]\n",
    "    preceded by a header row and followed by a 'total' row, or [] when either\n",
    "    file is missing. Raises if the files hold different numbers of queries.\n",
    "    \"\"\"\n",
    "    result = []\n",
    "    if fs.exists(query_time_file) and fs.exists(comp_query_time_file):\n",
    "        result.append(['query', name, comp_name, 'difference', 'percentage'])\n",
    "        \n",
    "        qtimes = {}\n",
    "        comp_qtimes = {}\n",
    "        with fs.open(query_time_file) as f:\n",
    "            qtimes = json.loads(f.read().decode('ascii'))\n",
    "        with fs.open(comp_query_time_file) as f:\n",
    "            comp_qtimes = json.loads(f.read().decode('ascii'))\n",
    "        \n",
    "        # Sort ids by (length, value) so e.g. 'q9' sorts before 'q10'; ids\n",
    "        # ending in 'a'/'b' use length-1 so variants sort with their base query.\n",
    "        query_ids = sorted(qtimes.keys(), key=lambda x: str(len(x))+x if x[-1] != 'a' and x[-1] != 'b' else str(len(x)-1) + x)\n",
    "        \n",
    "        if len(comp_qtimes) != len(qtimes):\n",
    "            raise Exception('Number of queries mismatch!')\n",
    "        \n",
    "        query_ids.append('total')\n",
    "        qtimes['total'] = sum([float(i) for i in qtimes.values()])\n",
    "        comp_qtimes['total'] = sum([float(i) for i in comp_qtimes.values()])\n",
    "        \n",
    "        for q in query_ids:\n",
    "            t1 = qtimes.get(q)\n",
    "            t2 = comp_qtimes.get(q)\n",
    "            # delta is comp - base; percentage is comp as a share of base.\n",
    "            # NOTE(review): raises ZeroDivisionError if a base time is 0.\n",
    "            delta = str(\"{:.2f}\".format(float(t2) - float(t1)))\n",
    "            perc = str(\"{:.2f}\".format((float(t2) / float(t1)) * 100)) + '%'\n",
    "            result.append([q, str(t1), str(t2), delta, perc])\n",
    "    return result\n",
    "\n",
    "def append_summary(appid, base_dir, name, comp_appid, comp_base_dir, comp_name, baseline_appid, baseline_base_dir, statsall, output):\n",
    "    \"\"\"Append the report's summary sections (stats table, changelogs, query\n",
    "    diffs) to the HTML file at 'output'.\n",
    "\n",
    "    Returns the total percentage versus the baseline run ('' when no baseline\n",
    "    appid is given or the diff could not be produced).\n",
    "    \"\"\"\n",
    "    with open(output,\"a\") as linkfile:\n",
    "\n",
    "        # Key/value stats table built from statsall.\n",
    "        difftable=''' <table border=\"1\" cellpadding=\"0\" cellspacing=\"0\">\n",
    "                            <tbody>'''\n",
    "        for k,v in statsall.items():\n",
    "            difftable+=f'''\n",
    "                <tr>\n",
    "                <td>{k}</td>\n",
    "                <td>{cvt_number(v)}</td>\n",
    "                </tr>'''\n",
    "        difftable+='''\n",
    "            </tbody>\n",
    "        </table>\\n'''\n",
    "        linkfile.write(difftable)\n",
    "        linkfile.write(\"\\n<br><hr/>\\n\")\n",
    "        \n",
    "        # Recent gluten commits, colorized by parse_changelog.\n",
    "        linkfile.write(\"\\n<font color=blue> gluten gitlog in last 2 days</font><br>\\n\")\n",
    "        out=parse_changelog(os.path.join('/', base_dir, appid, 'changelog_gluten'))\n",
    "        linkfile.write(\"<br>\".join(out))\n",
    "        linkfile.write(\"\\n<br><hr/>\\n\")\n",
    "        \n",
    "        # Recent velox commits.\n",
    "        linkfile.write(\"\\n<font color=blue> velox gitlog in last 2 days</font><br>\\n\")\n",
    "        out=parse_changelog(os.path.join('/', base_dir, appid, 'changelog_velox'))\n",
    "        linkfile.write(\"<br>\".join(out))\n",
    "        linkfile.write(\"\\n<br><hr/>\\n\")\n",
    "        \n",
    "        linkfile.write('''<div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-OutputArea-output \" data-mime-type=\"text/html\">\\n''')\n",
    "        \n",
    "        def append_query_diff(their_appid, their_base_dir, their_name):\n",
    "            # Render a per-query runtime comparison table against another run;\n",
    "            # returns the total percentage string, or '' when no diff was made.\n",
    "            query_diff=generate_query_diff(name, their_name, os.path.join('/', base_dir, appid, 'query_time.json'), os.path.join('/', their_base_dir, their_appid, 'query_time.json'))\n",
    "            if query_diff:\n",
    "                difftable='''\n",
    "                <table border=\"1\" cellpadding=\"0\" cellspacing=\"0\">\n",
    "                    <tbody>'''\n",
    "                for l in query_diff:\n",
    "                    difftable+='''\n",
    "                        <tr>'''\n",
    "                    base=0\n",
    "                    pr=0\n",
    "                    # Re-format numeric cells to 2 decimals for display.\n",
    "                    if re.match(r\"[0-9.]+\",l[1]):\n",
    "                        base=float(l[1])\n",
    "                        l[1]=\"{:.2f}\".format(base)\n",
    "                    if re.match(r\"[0-9.]+\",l[2]):\n",
    "                        pr=float(l[2])\n",
    "                        l[2]=\"{:.2f}\".format(pr)\n",
    "\n",
    "                    # Green row when the base run is slower, red when faster.\n",
    "                    for d in l:\n",
    "                        color='#000000'\n",
    "                        if base > pr:\n",
    "                            color='#6F9915'\n",
    "                        elif base < pr:\n",
    "                            color='#F92663'\n",
    "                        difftable += f'''\n",
    "                        <td><font color={color}>{d}</font></td>'''\n",
    "\n",
    "                    difftable+='''\n",
    "                        </tr>'''\n",
    "\n",
    "                difftable+='''\n",
    "                    </tbody>\n",
    "                </table>'''\n",
    "                linkfile.write(difftable)\n",
    "                linkfile.write(\"\\n<br><hr/>\\n\")\n",
    "                # return percentage\n",
    "                return query_diff[-1][-1]\n",
    "            return ''\n",
    "\n",
    "        baseline_perc = ''\n",
    "        if comp_appid:\n",
    "            append_query_diff(comp_appid, comp_base_dir, comp_name)\n",
    "        if baseline_appid:\n",
    "            baseline_perc = append_query_diff(baseline_appid, baseline_base_dir, 'Vanilla Spark')\n",
    "\n",
    "        linkfile.write(\"</div>\")\n",
    "        \n",
    "        return baseline_perc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def generate_email_body_title(appid, base_dir, name, comp_appid, comp_base_dir, comp_name, baseline_appid, baseline_base_dir, notebook, notebook_html, traceview, stats, summary, pr=''):\n",
    "    \"\"\"Write the report HTML body to /tmp/<appid>.html and build the email title.\n",
    "\n",
    "    Merges 'stats' with the per-app column of 'summary', writes the link header\n",
    "    and summary sections via append_summary(), and returns (output_path, title).\n",
    "    NOTE(review): relies on notebook globals (appals, local_ip) and IPython\n",
    "    shell magic -- must run inside the notebook after they are defined.\n",
    "    \"\"\"\n",
    "    statsall=collections.OrderedDict()\n",
    "    for k,v in stats.items():\n",
    "        statsall[k]=v\n",
    "    for k,v in summary.to_dict()[appals.appid].items():\n",
    "        statsall[k]=v\n",
    "    \n",
    "    pr_link=''\n",
    "    if pr:\n",
    "        pr_link=f'https://github.com/apache/incubator-gluten/pull/{pr}'\n",
    "        # Fetch the PR page title via shell; fail loudly if unreachable.\n",
    "        title=!wget --quiet -O - $pr_link | sed -n -e 's!.*<title>\\\\(.*\\\\)</title>.*!\\\\1!p'\n",
    "        if not title:\n",
    "            raise Exception(f'Failed to fetch PR link: {pr_link}')\n",
    "        pr_link=f'pr link: <a href=\"{pr_link}\">{title[0]}</a><br>'\n",
    "    \n",
    "    output=f'/tmp/{appid}.html'\n",
    "    with open(output, 'w+') as f:\n",
    "        f.writelines(f'''\n",
    "<font style=\"font-family: Courier New\">\n",
    "history event: <a href=\"http://{local_ip}:18080/tmp/sparkEventLog/{appid}/jobs/\">http://{local_ip}:18080/tmp/sparkEventLog/{appid}/jobs/</a><br>\n",
    "notebook: <a href=\"http://{local_ip}:8889/notebooks/{base_dir}/{notebook}\">http://{local_ip}:8889/notebooks/{base_dir}/{notebook}</a><br>\n",
    "notebook html: <a href=\"http://{local_ip}:8889/view/{base_dir}/{notebook_html}\">http://{local_ip}:8889/view/{base_dir}/{notebook_html}</a><br>\n",
    "traceview: <a href=\"{traceview}\">{traceview}</a><br>\n",
    "{pr_link}\n",
    "</font><hr/>''')\n",
    "    baseline_perc = append_summary(appid, base_dir, name, comp_appid, comp_base_dir, comp_name, baseline_appid, baseline_base_dir, statsall, output)\n",
    "    \n",
    "    # Bug fix: the file imports the 'datetime' module (not the class), so the\n",
    "    # previous datetime.now() raised AttributeError; use datetime.datetime.now().\n",
    "    title_prefix = f\"[ {datetime.datetime.now().strftime('%m_%d_%Y')} ]\" if not pr else f\"[ PR {pr} ]\"\n",
    "    title = f'{title_prefix} {name} {appid} {baseline_perc}'\n",
    "    return output,title"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# TPCDS query map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# TPC-DS sequential run index (1..103) -> query name. q14/q23/q24/q39 each\n",
    "# have 'a' and 'b' variants, which is why 103 indices cover 99 numbered queries.\n",
    "m='''1\tq01\n",
    "    2\tq02\n",
    "    3\tq03\n",
    "    4\tq04\n",
    "    5\tq05\n",
    "    6\tq06\n",
    "    7\tq07\n",
    "    8\tq08\n",
    "    9\tq09\n",
    "    10\tq10\n",
    "    11\tq11\n",
    "    12\tq12\n",
    "    13\tq13\n",
    "    14\tq14a\n",
    "    15\tq14b\n",
    "    16\tq15\n",
    "    17\tq16\n",
    "    18\tq17\n",
    "    19\tq18\n",
    "    20\tq19\n",
    "    21\tq20\n",
    "    22\tq21\n",
    "    23\tq22\n",
    "    24\tq23a\n",
    "    25\tq23b\n",
    "    26\tq24a\n",
    "    27\tq24b\n",
    "    28\tq25\n",
    "    29\tq26\n",
    "    30\tq27\n",
    "    31\tq28\n",
    "    32\tq29\n",
    "    33\tq30\n",
    "    34\tq31\n",
    "    35\tq32\n",
    "    36\tq33\n",
    "    37\tq34\n",
    "    38\tq35\n",
    "    39\tq36\n",
    "    40\tq37\n",
    "    41\tq38\n",
    "    42\tq39a\n",
    "    43\tq39b\n",
    "    44\tq40\n",
    "    45\tq41\n",
    "    46\tq42\n",
    "    47\tq43\n",
    "    48\tq44\n",
    "    49\tq45\n",
    "    50\tq46\n",
    "    51\tq47\n",
    "    52\tq48\n",
    "    53\tq49\n",
    "    54\tq50\n",
    "    55\tq51\n",
    "    56\tq52\n",
    "    57\tq53\n",
    "    58\tq54\n",
    "    59\tq55\n",
    "    60\tq56\n",
    "    61\tq57\n",
    "    62\tq58\n",
    "    63\tq59\n",
    "    64\tq60\n",
    "    65\tq61\n",
    "    66\tq62\n",
    "    67\tq63\n",
    "    68\tq64\n",
    "    69\tq65\n",
    "    70\tq66\n",
    "    71\tq67\n",
    "    72\tq68\n",
    "    73\tq69\n",
    "    74\tq70\n",
    "    75\tq71\n",
    "    76\tq72\n",
    "    77\tq73\n",
    "    78\tq74\n",
    "    79\tq75\n",
    "    80\tq76\n",
    "    81\tq77\n",
    "    82\tq78\n",
    "    83\tq79\n",
    "    84\tq80\n",
    "    85\tq81\n",
    "    86\tq82\n",
    "    87\tq83\n",
    "    88\tq84\n",
    "    89\tq85\n",
    "    90\tq86\n",
    "    91\tq87\n",
    "    92\tq88\n",
    "    93\tq89\n",
    "    94\tq90\n",
    "    95\tq91\n",
    "    96\tq92\n",
    "    97\tq93\n",
    "    98\tq94\n",
    "    99\tq95\n",
    "    100\tq96\n",
    "    101\tq97\n",
    "    102\tq98\n",
    "    103\tq99'''.split(\"\\n\")\n",
    "# Each line is 'index<TAB>name'; strip the indentation, split on the tab, then\n",
    "# build the final {int index: query name} dict.\n",
    "tpcds_query_map=[l.strip().split(\"\\t\") for l in m]\n",
    "tpcds_query_map={int(l[0]):l[1] for l in tpcds_query_map}"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "298.275px",
    "left": "1180px",
    "top": "317.125px",
    "width": "332px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
